# A simple implementation of Huffman coding. # # You have a string to encode. The following # steps must be performed to encode it: # # * count frequency with freq_count # * create an initial forest (of single leaf trees) with init_forest # * build a complete huffman encoding tree with build_huffman_tree # * calculate the two-way encodings (to encode and decode strings) # with set_char_encoding # * encode the string using the calculated encoding with encode_string # * then, you can also decode the encoded string with decode_string # # See the driver code in the bottom of the file for an example # use warnings; use strict; use Data::Dumper; use Benchmark; # Returns a frequency count hash of a string # sub freq_count { my $string = $_[0]; my %dict; foreach (split(//, $string)) { $dict{$_}++; } return \%dict; } # Given a frequency count hash, creates an initial # forest (an array of single node trees). # # Each tree simply contains one char, with its frequency # count # sub init_forest { my $r_dict = $_[0]; my @forest; foreach my $char (keys %$r_dict) { my $tree = { CHAR => $char, COUNT => $r_dict->{$char}, LEFT => undef, RIGHT => undef}; push(@forest, $tree); } return \@forest; } # Builds a huffman tree from an initial forest # # Iteratively choose the two trees from the forest # with minimal counts and merge them into one tree (create # a new node that has both trees as children). The single # tree left in the end is the Huffman tree for this initial # forest # # Note: the whole forest sorting part can probably be # more efficient with a heap data structure # sub build_huffman_tree { my $r_unsorted_forest = $_[0]; # initial sort, by COUNT # using the Schwartzian transform my @forest = map {$_->[1]} sort {$a->[0] <=> $b->[0]} map {[$_->{COUNT}, $_]} @$r_unsorted_forest; # do the following until there is only one tree # left in the forest # until (@forest == 1) { # take the 2 lowest-count trees my ($t1, $t2) = splice(@forest, 0, 2); # unite them my $new_tree = {CHAR => "", # the char only plays role in leaves COUNT => $t1->{COUNT} + $t2->{COUNT}, LEFT => $t1, RIGHT => $t2}; # insert the new tree back into the forest (but keep the # forest sorted !) my $i = 0; while (defined($forest[$i]) and $new_tree->{COUNT} > $forest[$i]->{COUNT}) { $i++; } splice(@forest, $i, 0, $new_tree); } # return the tree return $forest[0]; } # Given a huffman tree, creates the binary encodings for # the chars. Returns both an encoding and a decoding hashes # # Traverses the tree, remembering the path from the root # (0 is a left turn, 1 a right turn), and adds an encoding # and a decoding entry each time a leaf is encountered # sub set_char_encoding { my $r_tree = $_[0]; my $r_code = {}; my $r_rev_code = {}; my $bitstring = ""; aux_set_char_encoding($r_tree, $r_code, $r_rev_code, $bitstring); return ($r_code, $r_rev_code); } # A helper function for set_char_encoding # Note: recursive (in-order traversal of the tree) # sub aux_set_char_encoding { my $r_tree = $_[0]; my $r_code = $_[1]; my $r_rev_code = $_[2]; my $bitstring = $_[3]; # complete encoding if a leaf was found if (!defined($r_tree->{LEFT}) and !defined($r_tree->{RIGHT})) { $r_code->{$r_tree->{CHAR}} = $bitstring; $r_rev_code->{$bitstring} = $r_tree->{CHAR}; return; } # proceed recursively to children, remember the path in $bitstring # aux_set_char_encoding($r_tree->{RIGHT}, $r_code, $r_rev_code, $bitstring . "1"); aux_set_char_encoding($r_tree->{LEFT}, $r_code, $r_rev_code, $bitstring . "0"); } # Given a string and an encoding hash, returns the # encoded bit string # sub encode_string { my $string = $_[0]; my $r_code = $_[1]; my $encoded = ""; foreach (split(//, $string)) { $encoded .= $r_code->{$_}; } return $encoded; } # Given a bitstring and a decoding hash, returns the # decoded string. # # Huffman is a prefix code (no sequence is a prefix # of another sequence), hence it can always be uniquely # decoded. # sub decode_string { my $bitstring = $_[0]; my $r_rev_code = $_[1]; my $decoded = ""; my $temp = ""; foreach my $bit (split(//, $bitstring)) { # concat a new bit $temp .= $bit; # if the accumulated sequence fits some encoding, # decode it and empty the sequence if (defined $r_rev_code->{$temp}) { $decoded .= $r_rev_code->{$temp}; $temp = ""; } } return $decoded; } exit(main(@ARGV)); ## ## Driver code ## ## sub main { my $string = "anaconda"; my $r_dict = freq_count($string); my $r_forest = init_forest($r_dict); my $r_huff_tree = build_huffman_tree($r_forest); my ($r_code, $r_rev_code) = set_char_encoding($r_huff_tree); my $encoded_string = encode_string($string, $r_code); my $str_len_bits = length($string) * 8; print "Length in bits of original string: $str_len_bits\n"; print "Lenght in bits of encoded string: " . length($encoded_string) . "\n"; my $decoded = decode_string($encoded_string, $r_rev_code); return 0; }