![]() |
#2
远方的狼A2016-08-02 17:49
|
如果可以,可以补发代码
![]() |
#2
远方的狼A2016-08-02 17:49
你不发出来怎么看呢;
|
![]() |
#3
hyj06162016-08-02 19:23
C代码
int file_chunk_cdc(int fd, vector* features) { unsigned char buf[BUF_MAX_SIZE] = {0}; unsigned char buf_bz[BUF_MAX_SIZE] = {0}; unsigned char block_buf[BLOCK_MAX_SIZE * 2] = {0}; unsigned char last_block_buf[BLOCK_MAX_SIZE * 2] = {0}; char win_buf[BLOCK_WIN_SIZE + 1] = {0}; unsigned char md5_str[33] = {0}; unsigned char adler_pre_char; unsigned char md5_checksum[32 + 1] = {0}; unsigned int bpos = 0; unsigned int rwsize = 0, bzsize = 0; unsigned int exp_rwsize = BUF_MAX_SIZE; unsigned int head, tail; unsigned int block_sz = 0, old_block_sz = 0; unsigned int hkey = 0; int ret = 0; feature_t f = 0; while(rwsize = read(fd, buf + bpos, exp_rwsize)) { /* last chunk */ if ((rwsize + bpos + block_sz) < BLOCK_MIN_SIZE){ break; } head = 0; tail = bpos + rwsize; /* avoid unnecessary computation and comparsion */ if (block_sz < (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) { old_block_sz = block_sz; block_sz = ((block_sz + tail - head) > (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ? BLOCK_MIN_SIZE - BLOCK_WIN_SIZE : block_sz + tail -head; memcpy(block_buf + old_block_sz, buf + head, block_sz - old_block_sz); head += (block_sz - old_block_sz); } while ((head + BLOCK_WIN_SIZE) <= tail) { memcpy(win_buf, buf + head, BLOCK_WIN_SIZE); /* * Firstly, i think rabinhash is the best. However, it's performance is very bad. * After some testing, i found ELF_hash is better both on performance and dedup rate. * So, EFL_hash is default. Now, adler_hash as default. */ if (g_rolling_hash) { hkey = (block_sz == (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ? adler32_checksum(win_buf, BLOCK_WIN_SIZE) : adler32_rolling_checksum(hkey, BLOCK_WIN_SIZE, adler_pre_char, buf[head+BLOCK_WIN_SIZE-1]); } else hkey = g_cdc_chunk_hashfunc(win_buf); /* get a normal chunk */ if ((hkey % g_block_size) == CHUNK_CDC_R) { memcpy(block_buf + block_sz, buf + head, BLOCK_WIN_SIZE); head += BLOCK_WIN_SIZE; block_sz += BLOCK_WIN_SIZE; if (block_sz >= BLOCK_MIN_SIZE) { md5(block_buf, block_sz, md5_checksum); f = md5_2_feature(md5_checksum); VEC_PUSH_BACK(features, &f); /* if (0 != (ret = dedup_regfile_block_process(block_buf, block_sz, md5_checksum, fd_ldata, fd_bdata, pos, block_num, metadata, htable))) { perror("dedup_reggile_block_process in file_chunk_cdc"); goto _FILE_CHUNK_CDC_EXIT; } */ block_sz = 0; } } else { block_buf[block_sz++] = buf[head++]; /* get an abnormal chunk */ if (block_sz >= BLOCK_MAX_SIZE) { md5(block_buf, block_sz, md5_checksum); f = md5_2_feature(md5_checksum); VEC_PUSH_BACK(features, &f); /* if (0 != (ret = dedup_regfile_block_process(block_buf, block_sz, md5_checksum, fd_ldata, fd_bdata, pos, block_num, metadata, htable))) { perror("dedup_reggile_block_process in file_chunk_cdc"); goto _FILE_CHUNK_CDC_EXIT; } */ block_sz = 0; } } /* avoid unnecessary computation and comparsion */ if (block_sz == 0) { block_sz = ((tail - head) > (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ? BLOCK_MIN_SIZE - BLOCK_WIN_SIZE : tail - head; memcpy(block_buf, buf + head, block_sz); head = ((tail - head) > (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ? head + (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE) : tail; } adler_pre_char = buf[head -1]; } /* read expected data from file to full up buf */ bpos = tail - head; exp_rwsize = BUF_MAX_SIZE - bpos; adler_pre_char = buf[head -1]; memmove(buf, buf + head, bpos); } /* last chunk */ int last_block_len = ((rwsize + bpos + block_sz) >= 0) ? rwsize + bpos + block_sz : 0; if (last_block_len > 0) { memcpy(last_block_buf, block_buf, block_sz); memcpy(last_block_buf + block_sz, buf, rwsize + bpos); md5(last_block_buf, last_block_len, md5_checksum); f = md5_2_feature(md5_checksum); VEC_PUSH_BACK(features, &f); } _FILE_CHUNK_CDC_EXIT: return 0; } /* slide block chunk */ int file_chunk_sb(int fd, vector* features) { char buf[BUF_MAX_SIZE] = {0}; char buf_bz[BUF_MAX_SIZE] = {0}; char win_buf[BLOCK_MAX_SIZE * 2] = {0}; char block_buf[BLOCK_MAX_SIZE * 2] = {0}; char adler_pre_char; unsigned char md5_checksum[32 + 1] = {0}; unsigned char md5_checksum1[32 + 1] = {0}; char crc_checksum[16] = {0}; unsigned int bpos = 0; unsigned int slide_sz = 0; unsigned int rwsize = 0, bzsize = 0, bzsize_f = 0; unsigned int exp_rwsize = BUF_MAX_SIZE; unsigned int head, tail; unsigned int hkey = 0; unsigned int bflag = 0; int ret = 0; hashtable* sb_htable = create_hashtable(g_htab_bucket_nr); hashtable* sb_htable_crc = create_hashtable(g_htab_bucket_nr); if (NULL == sb_htable_crc || sb_htable == NULL) return -1; feature_t f, f1; while(rwsize = read(fd, buf + bpos, exp_rwsize)) { /* last chunk */ /* if ((rwsize + bpos + slide_sz) < g_block_size) break; */ head = 0; tail = bpos + rwsize; while ((head + g_block_size) <= tail) { memcpy(win_buf, buf + head, g_block_size); hkey = (slide_sz == 0) ? adler32_checksum(win_buf, g_block_size) : adler32_rolling_checksum(hkey, g_block_size, adler_pre_char, buf[head+g_block_size-1]); uint_2_str(hkey, crc_checksum); /* bflag: 0, both CRC and MD5 are not idenitical 1, both CRC and MD5 are identical 2, CRC is identical and MD5 is not */ bflag = 0; /* this block maybe is duplicate */ bzsize = g_block_size; if (hash_exist(sb_htable_crc, crc_checksum)) { bflag = 2; md5((unsigned char*)win_buf, bzsize, md5_checksum); f = md5_2_feature(md5_checksum); md5_2_str(md5_checksum); if (hash_exist(sb_htable, (char*)md5_checksum)) { /* insert fragment */ if (slide_sz != 0) { md5((unsigned char*)block_buf, slide_sz, md5_checksum1); f1 = md5_2_feature(md5_checksum1); VEC_PUSH_BACK(features, &f1); /* if (0 != (ret = dedup_regfile_block_process(block_buf, slide_sz, md5_checksum1, fd_ldata, fd_bdata, pos, block_num, metadata, htable))) { perror("dedup_regfile_block_process in file_chunk_sb"); goto _FILE_CHUNK_SB_EXIT; } */ } VEC_PUSH_BACK(features, &f); /* insert fixed-size block */ /* if (0 != (ret = dedup_regfile_block_process(win_buf, bzsize, md5_checksum, fd_ldata, fd_bdata, pos, block_num, metadata, htable))) { perror("dedup_regfile_block_process in file_chunk_sb"); goto _FILE_CHUNK_SB_EXIT; } */ head += g_block_size; slide_sz = 0; bflag = 1; } } /* this block is not duplicate */ if (bflag != 1) { block_buf[slide_sz++] = buf[head++]; if (slide_sz == g_block_size) { bzsize = g_block_size; /* calculate checksum and check in */ hkey = adler32_checksum(block_buf, bzsize); uint_2_str(hkey, crc_checksum); hash_checkin(sb_htable_crc, crc_checksum); md5((unsigned char*)block_buf, bzsize, md5_checksum); f = md5_2_feature(md5_checksum); VEC_PUSH_BACK(features, &f); /* if (0 != (ret = dedup_regfile_block_process(block_buf, bzsize, md5_checksum, fd_ldata, fd_bdata, pos, block_num, metadata, htable))) { perror("dedup_regfile_block_process in file_chunk_sb"); goto _FILE_CHUNK_SB_EXIT; } */ slide_sz = 0; } } adler_pre_char = buf[head - 1]; } /* read expected data from file to full up buf */ bpos = tail - head; exp_rwsize = BUF_MAX_SIZE - bpos; adler_pre_char = buf[head - 1]; memmove(buf, buf + head, bpos); } /* last chunk */ /* *last_block_len = ((rwsize + bpos + slide_sz) > 0) ? rwsize + bpos + slide_sz : 0; if (*last_block_len > 0) { memcpy(last_block_buf, block_buf, slide_sz); memcpy(last_block_buf + slide_sz, buf, rwsize + bpos); } */ _FILE_CHUNK_SB_EXIT: lseek(fd, 0, SEEK_SET); hash_free(sb_htable); hash_free(sb_htable_crc); return 0; } java代码 package cn.edu.cust.deduple; import import import import import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.security.NoSuchAlgorithmException; import java.util.Arrays; import java.util.HashMap; import java.util.Map; import cn.edu.cust.deduple.utils.Checksum; import cn.edu.cust.deduple.utils.MapUtils; import cn.edu.cust.deduple.utils.Md5Util; /** * @ClassName: CDC * @Description: * @author: 王大鸟 * @date: 2016/5/18 */ public class CDC { /** * rabin指纹之除数 */ private static final int D = 5; /** * rabin指纹之商 */ private static final int F = 3; /** * 最大值 */ private static final int MAX = 8 * 1024; /** * 最小值 */ private static final int MIN = 4 * 1024; /** * 滑动窗口大小 */ private static final int WINLEN = 4 * 1024; private static Map<String, Long> fingerPrints = new HashMap<String, Long>(); static final int BUF_MAX_SZ = 128 * 1024; static final int BLOCK_MAX_SZ = 4096; static final int BLOCK_WIN_SZ = 32; static final int BLOCK_MIN_SZ = 64; static final int BLOCK_SZ = 4096; static final int CHUNK_CDC_R = 13; FileChunks fileChunk(File f) throws IOException { byte buf[] = new byte[BUF_MAX_SZ]; byte block_buf[] = new byte[BLOCK_MAX_SZ + BLOCK_WIN_SZ]; byte win_buf[] = new byte[BLOCK_WIN_SZ + 1]; byte adler_pre_char = 0; //unsigned char md5_checksum[16 + 1] = {0}; //unsigned char csum[10 + 1] = {0}; int bpos = 0; int rwsize = 0; int exp_rwsize = BUF_MAX_SZ; int head, tail; int block_sz = 0, old_block_sz = 0; long hkey = 0; //chunk_block_entry chunk_bentry; long offset = 0; FileInputStream fin = new FileInputStream(f); FileChannel fc = fin.getChannel(); ByteBuffer bb = ByteBuffer.wrap(buf, bpos, exp_rwsize); FileChunks fcs = new FileChunks(); while((rwsize = fc.read(bb)) >= 0) { /* last chunk */ System.out.println("rwsize:" + rwsize); if ((rwsize + bpos + block_sz) < BLOCK_MIN_SZ) break; head = 0; tail = bpos + rwsize; /* avoid unnecessary computation and comparsion */ if (block_sz < (BLOCK_MIN_SZ - BLOCK_WIN_SZ)) { old_block_sz = block_sz; block_sz = ((block_sz + tail - head) > (BLOCK_MIN_SZ - BLOCK_WIN_SZ)) ? BLOCK_MIN_SZ - BLOCK_WIN_SZ : block_sz + tail -head; System.arraycopy(buf, head, block_buf, old_block_sz, block_sz - old_block_sz); //memcpy(block_buf + old_block_sz, buf + head, block_sz - old_block_sz); head += (block_sz - old_block_sz); } while ((head + BLOCK_WIN_SZ) <= tail) { System.arraycopy(buf, head, win_buf, 0, BLOCK_WIN_SZ); //memcpy(win_buf, buf + head, BLOCK_WIN_SZ); hkey = (block_sz == (BLOCK_MIN_SZ - BLOCK_WIN_SZ)) ? Checksum.adler32_checksum(win_buf, BLOCK_WIN_SZ) : Checksum.adler32_rolling_checksum((int)hkey, BLOCK_WIN_SZ, adler_pre_char, buf[head+BLOCK_WIN_SZ-1]); //System.out.println("hkey:" + (hkey % BLOCK_SZ)); /* get a normal chunk, write block info to chunk file */ if ((hkey % BLOCK_SZ) == CHUNK_CDC_R) { //System.out.println(block_sz + BLOCK_WIN_SZ); System.arraycopy(buf, head, block_buf, block_sz, BLOCK_WIN_SZ); //memcpy(block_buf + block_sz, buf + head, BLOCK_WIN_SZ); head += BLOCK_WIN_SZ; block_sz += BLOCK_WIN_SZ; if(block_sz > BLOCK_MAX_SZ){ System.out.println(">4096:" + block_sz); } if (block_sz >= BLOCK_MIN_SZ) { fcs.addChunk(offset, block_sz, block_buf); /*md5(block_buf, block_sz, md5_checksum); uint_2_str(adler32_checksum(block_buf, block_sz), csum); chunk_file_hdr->block_nr++; chunk_bentry.len = block_sz; chunk_bentry.offset = offset; memcpy(chunk_bentry.md5, md5_checksum, 16 + 1); memcpy(chunk_bentry.csum, csum, 10 + 1); rwsize = write(fd_chunk, &chunk_bentry, CHUNK_BLOCK_ENTRY_SZ); if (rwsize == -1 || rwsize != CHUNK_BLOCK_ENTRY_SZ) return -1;*/ offset += block_sz; block_sz = 0; } } else { block_buf[block_sz++] = buf[head++]; /* get an abnormal chunk, write block info to chunk file */ if (block_sz >= BLOCK_MAX_SZ) { fcs.addChunk(offset, block_sz, block_buf); /*md5(block_buf, block_sz, md5_checksum); uint_2_str(adler32_checksum(block_buf, block_sz), csum); chunk_file_hdr->block_nr++; chunk_bentry.len = block_sz; chunk_bentry.offset = offset; memcpy(chunk_bentry.md5, md5_checksum, 16+1); memcpy(chunk_bentry.csum, csum, 10 + 1); rwsize = write(fd_chunk, &chunk_bentry, CHUNK_BLOCK_ENTRY_SZ); if (rwsize == -1 || rwsize != CHUNK_BLOCK_ENTRY_SZ) return -1;*/ offset += block_sz; block_sz = 0; } } /* avoid unnecessary computation and comparsion */ if (block_sz == 0) { block_sz = ((tail - head) > (BLOCK_MIN_SZ - BLOCK_WIN_SZ)) ? BLOCK_MIN_SZ - BLOCK_WIN_SZ : tail - head; System.arraycopy(buf, head, block_buf, 0, block_sz); //memcpy(block_buf, buf + head, block_sz); head = ((tail - head) > (BLOCK_MIN_SZ - BLOCK_WIN_SZ)) ? head + (BLOCK_MIN_SZ - BLOCK_WIN_SZ) : tail; } adler_pre_char = buf[head - 1]; } /* read expected data from file to full up buf */ bpos = tail - head; exp_rwsize = BUF_MAX_SZ - bpos; adler_pre_char = buf[head - 1]; System.arraycopy(buf, head, buf, 0, bpos); //memmove(buf, buf + head, bpos); bb = ByteBuffer.wrap(buf, bpos, exp_rwsize); } fin.close(); return fcs; /*if (rwsize == -1) return -1; return 0;*/ } /** * CDC算法实现 * * @param file * @throws IOException * @throws NoSuchAlgorithmException */ public void process(File file) throws IOException, NoSuchAlgorithmException { byte[] win_buf = new byte[WINLEN]; long win_tail = 0; String fingerPrint = null; byte[] buf = new byte[MAX]; RandomAccessFile randomAccessFile = new RandomAccessFile(file, "r"); long fileLength = randomAccessFile.length(); while (randomAccessFile.read(win_buf) > 0) { long filePointer = randomAccessFile.getFilePointer(); //如果大于最大值 if (filePointer - win_tail > MAX) { randomAccessFile.seek(win_tail); randomAccessFile.read(buf); fingerPrint = Md5Util.getMD5(buf); MapUtils.judgeKey(fingerPrints, fingerPrint); win_tail = filePointer; continue; } //满足rabin指纹 if ((Arrays.hashCode(win_buf) % D == F) && !((filePointer - win_tail) < MIN)) { offset(randomAccessFile, filePointer, win_tail); win_tail = filePointer; } else if (filePointer == fileLength) { offset(randomAccessFile, filePointer, win_tail); break; } else { randomAccessFile.seek(filePointer - (4 * 1024) + 1); } } } /** * 重复代码封装 * * @param randomAccessFile * @param filePointer * @param win_tail * @throws IOException * @throws NoSuchAlgorithmException */ public void offset(RandomAccessFile randomAccessFile, long filePointer, long win_tail) throws IOException, NoSuchAlgorithmException { byte[] tmpbytes = new byte[Integer.parseInt(String.valueOf(filePointer - win_tail))]; randomAccessFile.seek(win_tail); randomAccessFile.read(tmpbytes); String fingerPrint = Md5Util.getMD5(tmpbytes); MapUtils.judgeKey(fingerPrints, fingerPrint); } public static void main(String[] args) { try{ CDC cdc = new CDC(); FileChunks fcs = cdc.fileChunk(new File("/home/qh/下载/SPAS data set/2.txt")); //fcs.printChunks(); }catch(IOException ex){ ex.printStackTrace(); } } } |
![]() |
#4
hyj06162016-08-02 19:24
回复 楼主 hyj0616
已发,请帮忙!
|
![]() |
#5
砖家的谎言2016-08-06 09:55
嗯,发到我们群里面帮你看看
|
![]() |
#6
仍去寻酒2016-08-24 04:33
握草,真长,,好久没看c 表示不懂
![]() ![]() |