/* hzip: file compression for sorted dictionaries with optional encryption, * algorithm: prefix-suffix encoding and 16-bit Huffman encoding */ #include #include #include #define CODELEN 65536 #define BUFSIZE 65536 #define EXTENSION ".hz" #define ESCAPE 31 #define MAGIC "hz0" #define MAGIC_ENCRYPTED "hz1" #define DESC "hzip - dictionary compression utility\n" \ "Usage: hzip [-h | -P password ] [file1 file2 ..]\n" \ " -P password encrypted compression\n" \ " -h display this help and exit\n" enum { code_LEAF, code_TERM, code_NODE}; struct item { unsigned short word; int count; char type; struct item * left; struct item * right; }; int fail(const char * err, const char * par) { fprintf(stderr, err, par); return 1; } void code2table(struct item * tree, char **table, char * code, int deep) { int first = 0; if (!code) { first = 1; code = malloc(CODELEN); } code[deep] = '1'; if (tree->left) code2table(tree->left, table, code, deep + 1); if (tree->type != code_NODE) { int i = tree->word; code[deep] = '\0'; if (tree->type == code_TERM) i = CODELEN; /* terminal code */ table[i] = malloc(deep + 1); strcpy(table[i], code); } code[deep] = '0'; if (tree->right) code2table(tree->right, table, code, deep + 1); if (first) free(code); } struct item * newitem(int c, struct item * l, struct item * r, int t) { struct item * ni = (struct item *) malloc(sizeof(struct item)); ni->type = t; ni->word = 0; ni->count = c; ni->left = l; ni->right = r; return ni; } /* return length of the freq array */ int get_freqdata(struct item *** dest, FILE * f, unsigned short * termword) { int freq[CODELEN]; int i, j, k, n; union { char c[2]; unsigned short word; } u; for (i = 0; i < CODELEN; i++) freq[i] = 0; while((j = getc(f)) != -1 && (k = getc(f)) != -1) { u.c[0] = j; u.c[1] = k; freq[u.word]++; } if (j != -1) { u.c[0] = 1; u.c[1] = j; } else { u.c[0] = 0; u.c[1] = 0; } *dest = (struct item **) malloc((CODELEN + 1) * sizeof(struct item *)); if (!*dest) return -1; for (i = 0, n = 0; i < CODELEN; i++) if (freq[i]) { (*dest)[n] = newitem(freq[i], NULL, NULL, code_LEAF); (*dest)[n]->word = i; n++; } /* terminal sequence (also contains the last odd byte of the file) */ (*dest)[n] = newitem(1, NULL, NULL, code_TERM); *termword = u.word; return n + 1; } void get_codetable(struct item **l, int n, char ** table) { int i; while (n > 1) { int min = 0; int mi2 = 1; for (i = 1; i < n; i++) { if (l[i]->count < l[min]->count) { mi2 = min; min = i; } else if (l[i]->count < l[mi2]->count) mi2 = i; } l[min] = newitem(l[min]->count + l[mi2]->count, l[min], l[mi2], code_NODE); for (i = mi2 + 1; i < n; i++) l[i - 1] = l[i]; n--; } code2table(l[0], table, NULL, 0); } int write_bits(FILE *f, char * bitbuf, int *bits, char * code) { while (*code) { int b = (*bits) % 8; if (!b) bitbuf[(*bits) / 8] = ((*code) - '0') << 7; else bitbuf[(*bits) / 8] |= (((*code) - '0') << (7 - b)); (*bits)++; code++; if (*bits == BUFSIZE * 8) { if (BUFSIZE != fwrite(bitbuf, 1, BUFSIZE, f)) return 1; *bits = 0; } } return 0; } int encode_file(char ** table, int n, FILE *f, FILE *f2, unsigned short tw, char * key) { char bitbuf[BUFSIZE]; int i, bits = 0; unsigned char cl, ch; int cx[2]; union { char c[2]; unsigned short word; } u; char * enc = key; /* header and codes */ fprintf(f2, "%s", (key ? MAGIC_ENCRYPTED : MAGIC)); /* 3-byte HEADER */ cl = (unsigned char) (n & 0x00ff); ch = (unsigned char) (n >> 8); if (key) { unsigned char cs; for (cs = 0; *enc; enc++) cs ^= *enc; fprintf(f2, "%c", cs); /* 1-byte check sum */ enc = key; ch ^= *enc; if ((*(++enc)) == '\0') enc = key; cl ^= *enc; } fprintf(f2, "%c%c", ch, cl); /* upper and lower byte of record count */ for (i = 0; i < BUFSIZE; i++) bitbuf[i] = '\0'; for (i = 0; i < CODELEN + 1; i++) if (table[i]) { int nmemb; u.word = (unsigned short) i; if (i == CODELEN) u.word = tw; if (key) { if (*(++enc) == '\0') enc = key; u.c[0] ^= *enc; if (*(++enc) == '\0') enc = key; u.c[1] ^= *enc; } fprintf(f2, "%c%c", u.c[0], u.c[1]); /* 2-character code id */ bits = 0; if (write_bits(f2, bitbuf, &bits, table[i]) != 0) return 1; if (key) { if (*(++enc) == '\0') enc = key; fprintf(f2, "%c", ((unsigned char) bits) ^ *enc); for (cl = 0; cl <= bits/8; cl++) { if (*(++enc) == '\0') enc = key; bitbuf[cl] ^= *enc; } } else fprintf(f2, "%c", (unsigned char) bits); /* 1-byte code length */ nmemb = bits/8 + 1; if (fwrite(bitbuf, 1, bits/8 + 1, f2) != nmemb) /* x-byte code */ return 1; } /* file encoding */ bits = 0; while((cx[0] = getc(f)) != -1 && (cx[1] = getc(f)) != -1) { u.c[0] = cx[0]; u.c[1] = cx[1]; if (write_bits(f2, bitbuf, &bits, table[u.word]) != 0) return 1; } /* terminal suffixes */ if (write_bits(f2, bitbuf, &bits, table[CODELEN]) != 0) return 1; if (bits > 0) { int nmemb = bits/8 + 1; if (fwrite(bitbuf, 1, nmemb, f2) != nmemb) return 1; } return 0; } int prefixcompress(FILE *f, FILE *tempfile) { char buf[BUFSIZE]; char buf2[BUFSIZE * 2]; char prev[BUFSIZE]; int prevlen = 0; while(fgets(buf,BUFSIZE,f)) { int i, j, k, m, c=0; int pfx = prevlen; char * p = buf2; m = j = 0; for (i = 0; buf[i]; i++) { if ((pfx > 0) && (buf[i] == prev[i])) { j++; } else pfx = 0; } if (i > 0 && buf[i - 1] == '\n') { if (j == i) j--; /* line duplicate */ if (j > 29) j = 29; c = j; if (c == '\t') c = 30; /* common suffix */ for (; buf[i - m - 2] == prev[prevlen - m - 2] && m < i - j - 1 && m < 15; m++); if (m == 1) m = 0; } else { j = 0; m = -1; } for (k = j; k < i - m - 1; k++, p++) { if (((unsigned char) buf[k]) < 47 && buf[k] != '\t' && buf[k] != ' ') { *p = ESCAPE; p++; } *p = buf[k]; } if (m > 0) { *p = m + 31; /* 33-46 */ p++; } if (i > 0 && buf[i - 1] == '\n') { size_t nmemb = p - buf2 + 1; *p = c; if (fwrite(buf2, 1, nmemb, tempfile) != nmemb) return 1; } else { size_t nmemb = p - buf2; if (fwrite(buf2, 1, nmemb, tempfile) != nmemb) return 1; } memcpy(prev, buf, i); prevlen = i; } return 0; } int hzip(const char * filename, char * key) { struct item ** list; char * table[CODELEN + 1]; int n; char out[BUFSIZE]; FILE *f, *f2, *tempfile; unsigned short termword; strcpy(out, filename); strcat(out, EXTENSION); f = fopen(filename, "r"); if (!f) return fail("hzip: %s: Permission denied\n", filename); tempfile = tmpfile(); if (!tempfile) { fclose(f); return fail("hzip: cannot create temporary file\n", NULL); } f2 = fopen(out, "wb"); if (!f2) { fclose(tempfile); fclose(f); return fail("hzip: %s: Permission denied\n", out); } for (n = 0; n < CODELEN; n++) table[n] = NULL; if (prefixcompress(f, tempfile) != 0) { fclose(f2); fclose(tempfile); fclose(f); return fail("hzip: cannot write file\n", NULL); } rewind(tempfile); n = get_freqdata(&list, tempfile, &termword); get_codetable(list, n, table); rewind(tempfile); n = encode_file(table, n, tempfile, f2, termword, key); fclose(f2); fclose(tempfile); fclose(f); if (n != 0) return fail("hzip: cannot write file\n", NULL); return n; } int main(int argc, char** argv) { int i, j = 0; char * key = NULL; for (i = 1; i < argc; i++) { if (*(argv[i]) == '-') { if (*(argv[i] + 1) == 'h') return fail(DESC, NULL); if (*(argv[i] + 1) == 'P') { if (i + 1 == argc) return fail("hzip: missing password\n", NULL); key = argv[i + 1]; i++; continue; } return fail("hzip: no such option: %s\n", argv[i]); } else if (hzip(argv[i], key) != 0) return 1; else j = 1; } if (j == 0) return fail("hzip: need a filename parameter\n", NULL); return 0; }