/* Huffman Coding */

/*
 * Huffman Coding in C.
 *
 * This program reads a file named on the command line, then compresses it
 * using Huffman coding.  The file is read twice: once to determine the
 * frequencies of the characters, and again to do the actual compression.
 *
 * The output is written to "<input>.huf" and consists of:
 *   1. the NUM_CHARS frequency counts, as raw unsigned ints;
 *   2. the number of characters in the input, as one raw unsigned int;
 *   3. the code bits, packed most-significant-bit first, with the final
 *      byte padded out with 0 bits.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* there are 256 possible characters */
#define NUM_CHARS 256

/* tree node, heap node */
typedef struct _treenode treenode;
struct _treenode {
    unsigned int freq;  /* frequency; is the priority for heap.  Unsigned
                         * to match the counts in freqs[] and to avoid
                         * signed overflow when two subtrees are summed */
    unsigned char ch;   /* character, if any (only meaningful in leaves) */
    treenode *left,     /* left child of Huffman tree (not heap!) */
             *right;    /* right child of Huffman tree */
};

/* this is a priority queue implemented as a binary min-heap */
typedef struct _pq {
    int heap_size;              /* number of live entries in A[] */
    treenode *A[NUM_CHARS];     /* heap storage, root at A[0] */
} PQ;

/* malloc() that reports failure and terminates instead of returning NULL */
static void *xmalloc (size_t size) {
    void *mem = malloc (size);

    if (mem == NULL) {
        fprintf (stderr, "out of memory\n");
        exit (1);
    }
    return mem;
}

/* release every node of the (sub)tree rooted at r; r may be NULL */
static void free_tree (treenode *r) {
    if (r == NULL) {
        return;
    }
    free_tree (r->left);
    free_tree (r->right);
    free (r);
}

/* create an empty queue */
void create_pq (PQ *p) {
    p->heap_size = 0;
}

/* this heap node's parent */
int parent (int i) {
    return (i - 1) / 2;
}

/* this heap node's left kid */
int left (int i) {
    return i * 2 + 1;
}

/* this heap node's right kid */
int right (int i) {
    return i * 2 + 2;
}

/* makes the subheap with root i into a heap, assuming left(i) and
 * right(i) are heaps */
void heapify (PQ *p, int i) {
    int l, r, smallest;
    treenode *t;

    l = left (i);
    r = right (i);

    /* find the smallest of parent, left, and right */
    if (l < p->heap_size && p->A[l]->freq < p->A[i]->freq) {
        smallest = l;
    } else {
        smallest = i;
    }
    if (r < p->heap_size && p->A[r]->freq < p->A[smallest]->freq) {
        smallest = r;
    }

    /* swap the parent with the smallest, if needed, then repair the
     * subheap the parent was moved into */
    if (smallest != i) {
        t = p->A[i];
        p->A[i] = p->A[smallest];
        p->A[smallest] = t;
        heapify (p, smallest);
    }
}

/* insert an element into the priority queue.  r->freq is the priority */
void insert_pq (PQ *p, treenode *r) {
    int i;

    /* A[] has room for exactly NUM_CHARS entries; mirror the underflow
     * check in extract_min_pq() rather than writing past the array */
    if (p->heap_size >= NUM_CHARS) {
        printf ("heap overflow!\n");
        exit (1);
    }

    p->heap_size++;
    i = p->heap_size - 1;

    /* we would like to place r at the end of the array,
     * but this might violate the heap property.  we'll start
     * at the end and work our way up */
    while ((i > 0) && (p->A[parent (i)]->freq > r->freq)) {
        p->A[i] = p->A[parent (i)];
        i = parent (i);
    }
    p->A[i] = r;
}

/* remove the element at head of the queue (i.e., with minimum frequency) */
treenode *extract_min_pq (PQ *p) {
    treenode *r;

    if (p->heap_size == 0) {
        printf ("heap underflow!\n");
        exit (1);
    }

    /* get return value out of the root */
    r = p->A[0];

    /* take the last and stick it in the root (just like heapsort) */
    p->A[0] = p->A[p->heap_size - 1];

    /* one less thing in queue */
    p->heap_size--;

    /* left and right are a heap, make the root a heap */
    heapify (p, 0);
    return r;
}

/* read the file, computing the frequencies for each character
 * and placing them in v[] (which the caller must have zeroed).
 * returns the number of characters read.  stops at end of file or on a
 * read error; the caller can tell the two apart with ferror(). */
unsigned int get_frequencies (FILE *f, unsigned int v[]) {
    int c;
    unsigned int n = 0;     /* n counts characters */

    /* fgetc() gets an unsigned char, converts to int.  comparing the int
     * against EOF (rather than testing feof()) also stops on a read
     * error, so EOF (-1) is never used as an index into v[] */
    while ((c = fgetc (f)) != EOF) {
        /* one more of this character */
        v[c]++;
        n++;
    }
    return n;
}

/* make the huffman tree from frequencies in freqs[] (Huffman's Algorithm).
 * every one of the NUM_CHARS characters gets a leaf, even with frequency
 * zero.  the caller owns the returned tree. */
treenode *build_huffman (unsigned int freqs[]) {
    int i, n;
    treenode *x, *y, *z;
    PQ p;

    /* make an empty queue */
    create_pq (&p);

    /* for each character, make a heap/tree node with its value
     * and frequency */
    for (i = 0; i < NUM_CHARS; i++) {
        x = xmalloc (sizeof *x);

        /* its a leaf of the Huffman tree */
        x->left = NULL;
        x->right = NULL;
        x->freq = freqs[i];
        x->ch = (unsigned char) i;

        /* put this node into the heap */
        insert_pq (&p, x);
    }

    /* at this point, the heap is a "forest" of singleton trees */
    n = p.heap_size - 1;    /* heap_size isn't loop invariant! */

    /* if we insert two things and remove one each time,
     * at the end of heap_size-1 iterations, there will be
     * one tree left in the heap */
    for (i = 0; i < n; i++) {
        /* make a new node z from the two least frequent
         * nodes x and y */
        z = xmalloc (sizeof *z);
        x = extract_min_pq (&p);
        y = extract_min_pq (&p);
        z->left = x;
        z->right = y;
        z->ch = 0;          /* unused in interior nodes */

        /* z's frequency is the sum of x and y */
        z->freq = x->freq + y->freq;

        /* put this back in the queue */
        insert_pq (&p, z);
    }

    /* return the only thing left in the queue, the whole Huffman tree */
    return extract_min_pq (&p);
}

/* traverse the Huffman tree, building up the codes in codes[].
 * each codes[ch] is set to a freshly allocated string of '0'/'1'
 * characters that the caller must free(). */
void traverse (treenode *r,         /* root of this (sub)tree */
               int level,           /* current level in Huffman tree */
               char code_so_far[],  /* code string up to this point in tree */
               char *codes[]) {     /* array of codes */
    /* if we're at a leaf node, */
    if ((r->left == NULL) && (r->right == NULL)) {
        /* put in a null terminator */
        code_so_far[level] = '\0';

        /* make a copy of the code and put it in the array.  done by hand
         * because strdup() is not part of ISO C before C23 */
        codes[r->ch] = xmalloc ((size_t) level + 1);
        memcpy (codes[r->ch], code_so_far, (size_t) level + 1);
    } else {
        /* not at a leaf node.  go left with bit 0 */
        code_so_far[level] = '0';
        traverse (r->left, level + 1, code_so_far, codes);

        /* go right with bit 1 */
        code_so_far[level] = '1';
        traverse (r->right, level + 1, code_so_far, codes);
    }
}

/* global variables, a necessary evil */
int nbits,          /* bits accumulated so far in current_byte */
    current_byte,   /* the byte being assembled by bitout() */
    nbytes;         /* bytes written by bitout() since encode_file() began */

/* output a single bit to an open file.  b is '1' for a one bit; any other
 * value is written as a zero bit.  bits are buffered in current_byte and
 * only reach the file once eight have been collected. */
void bitout (FILE *f, char b) {
    /* shift current byte left one */
    current_byte <<= 1;

    /* put a one on the end of this byte if b is '1' */
    if (b == '1') {
        current_byte |= 1;
    }

    /* one more bit */
    nbits++;

    /* enough bits? write out the byte */
    if (nbits == 8) {
        fputc (current_byte, f);
        nbytes++;
        nbits = 0;
        current_byte = 0;
    }
}

/* using the codes in codes[], encode the file in infile, writing
 * the result on outfile.  stops at end of file or on a read error. */
void encode_file (FILE *infile, FILE *outfile, char *codes[]) {
    int c;
    const char *s;

    /* initialize globals for bitout() */
    current_byte = 0;
    nbits = 0;
    nbytes = 0;

    /* continue until end of file.  the character is kept in an int so
     * that EOF is distinguishable from the byte 0xFF and a read error
     * cannot spin forever */
    while ((c = fgetc (infile)) != EOF) {
        /* put the corresponding bitstring on outfile */
        for (s = codes[c]; *s; s++) {
            bitout (outfile, *s);
        }
    }

    /* finish off the last byte */
    while (nbits) {
        bitout (outfile, '0');
    }
}

/* main program */
int main (int argc, char *argv[]) {
    FILE *f, *g;
    treenode *r;                /* root of Huffman tree */
    unsigned int n,             /* number of bytes in file */
        freqs[NUM_CHARS];       /* frequency of each char */
    char *codes[NUM_CHARS],     /* array of codes, 1 per char */
        code[NUM_CHARS],        /* a place to hold one code: with
                                 * NUM_CHARS leaves a code is at most
                                 * NUM_CHARS-1 bits plus the terminator */
        *fname;                 /* what to call output file */
    size_t fname_size;
    int i, write_failed;

    /* hassle user */
    if (argc != 2) {
        fprintf (stderr, "Usage: %s <filename>\n", argv[0]);
        exit (1);
    }

    /* set all frequencies to zero */
    memset (freqs, 0, sizeof (freqs));

    /* open command line argument file; binary mode so no byte of the
     * input is translated or treated as an end-of-file marker */
    f = fopen (argv[1], "rb");
    if (!f) {
        perror (argv[1]);
        exit (1);
    }

    /* compute frequencies from this file */
    n = get_frequencies (f, freqs);
    if (ferror (f)) {
        perror (argv[1]);
        exit (1);
    }
    fclose (f);

    /* make the huffman tree */
    r = build_huffman (freqs);

    /* traverse the tree, filling codes[] with the codes */
    traverse (r, 0, code, codes);

    /* name the output file something.huf; the buffer is sized from the
     * input name so an arbitrarily long path cannot overflow it */
    fname_size = strlen (argv[1]) + sizeof (".huf");
    fname = xmalloc (fname_size);
    snprintf (fname, fname_size, "%s.huf", argv[1]);

    g = fopen (fname, "wb");
    if (!g) {
        perror (fname);
        exit (1);
    }

    /* write frequencies to file so they can be reproduced, then
     * write number of characters to file as binary int */
    if (fwrite (freqs, sizeof freqs[0], NUM_CHARS, g) != NUM_CHARS
        || fwrite (&n, sizeof n, 1, g) != 1) {
        perror (fname);
        exit (1);
    }

    /* open input file again */
    f = fopen (argv[1], "rb");
    if (!f) {
        perror (argv[1]);
        exit (1);
    }

    /* encode f to g with codes[] */
    encode_file (f, g, codes);
    if (ferror (f)) {
        perror (argv[1]);
        exit (1);
    }
    fclose (f);

    /* fclose() flushes, so a full disk may only show up here */
    write_failed = ferror (g);
    if (fclose (g) != 0 || write_failed) {
        perror (fname);
        exit (1);
    }

    /* brag.  nbytes counts only the encoded bytes written by bitout().
     * scaled by 100 so the number agrees with the '%' sign, and guarded
     * so an empty input does not divide by zero */
    printf ("%s is %0.2f%% of %s\n",
            fname,
            n ? 100.0 * (double) nbytes / (double) n : 0.0,
            argv[1]);

    /* release everything we allocated */
    for (i = 0; i < NUM_CHARS; i++) {
        free (codes[i]);
    }
    free_tree (r);
    free (fname);

    return 0;
}

/* Sign up to vote on this title */
/* Useful / Not useful */