回复 2楼 linlulu001
											C代码
/*
 * Hash one finished chunk with md5() and append the resulting feature
 * to the output vector.
 */
static void cdc_emit_feature(vector *features, unsigned char *block, unsigned int len)
{
    unsigned char md5_checksum[32 + 1] = {0};
    feature_t f;

    md5(block, len, md5_checksum);
    f = md5_2_feature(md5_checksum);
    VEC_PUSH_BACK(features, &f);
}

/*
 * cdc chunk
 *
 * Split the data readable from `fd` into content-defined chunks and push
 * one feature (derived from the chunk's md5) per chunk into `features`.
 *
 * A chunk boundary is declared when the hash of the current
 * BLOCK_WIN_SIZE-byte window satisfies (hkey % g_block_size) == CHUNK_CDC_R.
 * The first BLOCK_MIN_SIZE - BLOCK_WIN_SIZE bytes of every chunk are copied
 * without hashing, and a chunk is cut unconditionally once it reaches
 * BLOCK_MAX_SIZE bytes. Whatever is left at end of input is emitted as a
 * final chunk.
 *
 * Returns 0 on success, -1 if read() fails (features pushed so far are
 * left in the vector).
 */
int file_chunk_cdc(int fd, vector* features) {
    unsigned char buf[BUF_MAX_SIZE] = {0};
    unsigned char block_buf[BLOCK_MAX_SIZE * 2] = {0};
    char win_buf[BLOCK_WIN_SIZE + 1] = {0};
    unsigned char adler_pre_char = 0;   /* byte that just left the rolling window */
    unsigned int bpos = 0;              /* unconsumed bytes kept at the start of buf */
    unsigned int rwsize = 0;
    unsigned int head, tail;
    unsigned int block_sz = 0, old_block_sz = 0;
    unsigned int hkey = 0;
    ssize_t nread;

    for (;;)
    {
        /* read expected data from file to full up buf */
        nread = read(fd, buf + bpos, BUF_MAX_SIZE - bpos);
        if (nread < 0)
            return -1;      /* keep the error out of the unsigned size arithmetic */
        if (nread == 0)
            break;          /* end of input */
        rwsize = (unsigned int)nread;

        /*
         * Not enough data for even a minimal chunk yet. A short read is not
         * necessarily end of input, so keep the bytes and read again; the
         * real end of input is detected by read() returning 0.
         */
        if ((rwsize + bpos + block_sz) < BLOCK_MIN_SIZE)
        {
            bpos += rwsize;
            continue;
        }

        head = 0;
        tail = bpos + rwsize;

        /* avoid unnecessary computation and comparison: prefill the unhashed prefix */
        if (block_sz < (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE))
        {
            old_block_sz = block_sz;
            block_sz = ((block_sz + tail - head) > (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ?
                    BLOCK_MIN_SIZE - BLOCK_WIN_SIZE : block_sz + tail - head;

            memcpy(block_buf + old_block_sz, buf + head, block_sz - old_block_sz);
            head += (block_sz - old_block_sz);
        }

        while ((head + BLOCK_WIN_SIZE) <= tail)
        {
            memcpy(win_buf, buf + head, BLOCK_WIN_SIZE);
            /*
             * Original author's note: rabinhash was the first choice but
             * performed badly; ELF_hash did better on both speed and dedup
             * rate; adler_hash is the current default.
             */
            if (g_rolling_hash)
            {
                /* full checksum for the first window of a chunk, rolling update afterwards */
                hkey = (block_sz == (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ?
                    adler32_checksum(win_buf, BLOCK_WIN_SIZE) :
                    adler32_rolling_checksum(hkey, BLOCK_WIN_SIZE, adler_pre_char,
                                             buf[head + BLOCK_WIN_SIZE - 1]);
            }
            else
            {
                hkey = g_cdc_chunk_hashfunc(win_buf);
            }

            if ((hkey % g_block_size) == CHUNK_CDC_R)
            {
                /* get a normal chunk: the whole window closes the chunk */
                memcpy(block_buf + block_sz, buf + head, BLOCK_WIN_SIZE);
                head += BLOCK_WIN_SIZE;
                block_sz += BLOCK_WIN_SIZE;
                if (block_sz >= BLOCK_MIN_SIZE)
                {
                    cdc_emit_feature(features, block_buf, block_sz);
                    block_sz = 0;
                }
            }
            else
            {
                /* no boundary: slide the window by one byte */
                block_buf[block_sz++] = buf[head++];
                /* get an abnormal chunk: forced cut at the maximum size */
                if (block_sz >= BLOCK_MAX_SIZE)
                {
                    cdc_emit_feature(features, block_buf, block_sz);
                    block_sz = 0;
                }
            }

            /* a chunk was just emitted: prefill the next chunk's unhashed prefix */
            if (block_sz == 0)
            {
                block_sz = ((tail - head) > (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ?
                    BLOCK_MIN_SIZE - BLOCK_WIN_SIZE : tail - head;
                memcpy(block_buf, buf + head, block_sz);
                head += block_sz;
            }
            /* head was advanced by at least one byte above, so head - 1 is valid */
            adler_pre_char = buf[head - 1];
        }

        /* keep the unconsumed tail at the front of buf for the next read */
        bpos = tail - head;
        if (head > 0)
        {
            /* remember the byte before the window before it is overwritten */
            adler_pre_char = buf[head - 1];
            memmove(buf, buf + head, bpos);
        }
        /* head == 0: nothing was consumed, buf and adler_pre_char are already right */
    }

    /* last chunk: partial block plus whatever is still buffered */
    if ((block_sz + bpos) > 0)
    {
        memcpy(block_buf + block_sz, buf, bpos);
        cdc_emit_feature(features, block_buf, block_sz + bpos);
    }
    return 0;
}
改写的 Java 代码
package cn.edu.cust.deduple;
import 
import 
import 
import 
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import cn.edu.cust.deduple.utils.Checksum;
import cn.edu.cust.deduple.utils.MapUtils;
import cn.edu.cust.deduple.utils.Md5Util;
public class CDC {
    private static Map<String, Long> fingerPrints = new HashMap<String, Long>();
    
    static final int BUF_MAX_SZ = 128 * 1024;
    static final int BLOCK_MAX_SZ = 4096;
    static final int BLOCK_WIN_SZ = 32;
    static final int BLOCK_MIN_SZ = 64;
    
    static final int BLOCK_SZ = 4096;
    static final int CHUNK_CDC_R = 13;
    
    FileChunks fileChunk(File f) throws IOException {
        byte buf[] = new byte[BUF_MAX_SZ];
        byte block_buf[] = new byte[BLOCK_MAX_SZ + BLOCK_WIN_SZ];
        byte win_buf[] = new byte[BLOCK_WIN_SZ + 1];
        byte adler_pre_char = 0;
        //unsigned char md5_checksum[16 + 1] = {0};
        //unsigned char csum[10 + 1] = {0};
        int bpos = 0;
        int rwsize = 0;
        int exp_rwsize = BUF_MAX_SZ;
        int head, tail;
        int block_sz = 0, old_block_sz = 0;
        long hkey = 0;
        //chunk_block_entry chunk_bentry;
        long offset = 0;
        FileInputStream fin = new FileInputStream(f);
        FileChannel fc = fin.getChannel();
        ByteBuffer bb = ByteBuffer.wrap(buf, bpos, exp_rwsize);
        FileChunks fcs = new FileChunks();
        fcs.fileName = f.getName();
        
        while((rwsize = fc.read(bb)) >= 0) {
            /* last chunk */
            //System.out.println("rwsize:" + rwsize);
            if ((rwsize + bpos + block_sz) < BLOCK_MIN_SZ)
                break;
            head = 0;
            tail = bpos + rwsize;
            /* avoid unnecessary computation and comparsion */
            if (block_sz < (BLOCK_MIN_SZ - BLOCK_WIN_SZ)) {
                old_block_sz = block_sz;
                block_sz = ((block_sz + tail - head) > (BLOCK_MIN_SZ - BLOCK_WIN_SZ)) ?
                    BLOCK_MIN_SZ - BLOCK_WIN_SZ : block_sz + tail -head;
                System.arraycopy(buf, head, block_buf, old_block_sz, block_sz - old_block_sz);
                //memcpy(block_buf + old_block_sz, buf + head, block_sz - old_block_sz);
                head += (block_sz - old_block_sz);
            }
            while ((head + BLOCK_WIN_SZ) <= tail) {
                System.arraycopy(buf, head, win_buf, 0, BLOCK_WIN_SZ);
                //memcpy(win_buf, buf + head, BLOCK_WIN_SZ);
                hkey = (block_sz == (BLOCK_MIN_SZ - BLOCK_WIN_SZ)) ? Checksum.adler32_checksum(win_buf, BLOCK_WIN_SZ) :
                    Checksum.adler32_rolling_checksum((int)hkey, BLOCK_WIN_SZ, adler_pre_char, buf[head+BLOCK_WIN_SZ-1]);
                //System.out.println("hkey:" + (hkey % BLOCK_SZ));
                /* get a normal chunk, write block info to chunk file */
                if ((hkey % BLOCK_SZ) == CHUNK_CDC_R) {
                    //System.out.println(block_sz + BLOCK_WIN_SZ);
                    System.arraycopy(buf, head, block_buf, block_sz, BLOCK_WIN_SZ);
                    //memcpy(block_buf + block_sz, buf + head, BLOCK_WIN_SZ);
                    head += BLOCK_WIN_SZ;
                    block_sz += BLOCK_WIN_SZ;
                    if(block_sz > BLOCK_MAX_SZ){
                        //System.out.println(">4096:" + block_sz);
                    }
                    if (block_sz >= BLOCK_MIN_SZ) {
                        fcs.addChunk(offset, block_sz, block_buf);
                        /*md5(block_buf, block_sz, md5_checksum);
                        uint_2_str(adler32_checksum(block_buf, block_sz), csum);
                        chunk_file_hdr->block_nr++;
                        chunk_bentry.len = block_sz;
                        chunk_bentry.offset = offset;
                        memcpy(chunk_bentry.md5, md5_checksum, 16 + 1);
                        memcpy(chunk_bentry.csum, csum, 10 + 1);
                        rwsize = write(fd_chunk, &chunk_bentry, CHUNK_BLOCK_ENTRY_SZ);
                        if (rwsize == -1 || rwsize != CHUNK_BLOCK_ENTRY_SZ)
                            return -1;*/
                        offset += block_sz;
                        block_sz = 0;
                    }
                } else {
                    block_buf[block_sz++] = buf[head++];
                    /* get an abnormal chunk, write block info to chunk file */
                    if (block_sz >= BLOCK_MAX_SZ) {
                        fcs.addChunk(offset, block_sz, block_buf);
                        /*md5(block_buf, block_sz, md5_checksum);
                        uint_2_str(adler32_checksum(block_buf, block_sz), csum);
                        chunk_file_hdr->block_nr++;
                        chunk_bentry.len = block_sz;
                        chunk_bentry.offset = offset;
                        memcpy(chunk_bentry.md5, md5_checksum, 16+1);
                        memcpy(chunk_bentry.csum, csum, 10 + 1);
                        rwsize = write(fd_chunk, &chunk_bentry, CHUNK_BLOCK_ENTRY_SZ);
                        if (rwsize == -1 || rwsize != CHUNK_BLOCK_ENTRY_SZ)
                            return -1;*/
                        offset += block_sz;
                        block_sz = 0;
                    }
                }
                /* avoid unnecessary computation and comparsion */
                if (block_sz == 0) {
                    block_sz = ((tail - head) > (BLOCK_MIN_SZ - BLOCK_WIN_SZ)) ?
                        BLOCK_MIN_SZ - BLOCK_WIN_SZ : tail - head;
                    System.arraycopy(buf, head, block_buf, 0, block_sz);
                    //memcpy(block_buf, buf + head, block_sz);
                    head = ((tail - head) > (BLOCK_MIN_SZ - BLOCK_WIN_SZ)) ?
                        head + (BLOCK_MIN_SZ - BLOCK_WIN_SZ) : tail;
                }
                adler_pre_char = buf[head - 1];
            }
            /* read expected data from file to full up buf */
            bpos = tail - head;
            exp_rwsize = BUF_MAX_SZ - bpos;
            adler_pre_char = buf[head - 1];
            System.arraycopy(buf, head, buf, 0, bpos);
            //memmove(buf, buf + head, bpos);
            bb = ByteBuffer.wrap(buf, bpos, exp_rwsize);
        }
        fin.close();
        return fcs;
        /*if (rwsize == -1)
            return -1;
        return 0;*/
    }
}