#!/usr/bin/perl # Given a word list file in stdin, and the name of a file to be used # to map letters in the word list to values 0..31, translate the list # using the mapping, sort it by the new values, and finally build and # emit to files a dawg. # This is meant eventually to be runnable as part of a cgi system for # letting users generating Crosswords dicts online. ############################################################################## # adapted from C++ code Copyright (C) 2000 Falk Hueffner # This version Copyright (C) 2002 Eric House # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 # USA ############################################################################## use strict; my $gFirstDiff; my @gCurrentWord; my $gDone = 0; my @gNodes; # final array of nodes my $gNumNodes = 0; my $debug = 0; main(); sub main { # Do I need this stupid thing? Better to move the first row to # the front of the array and patch everything else. Or fix the # non-palm dictionary format to include the offset of the first # node. $gNodes[$gNumNodes++] = MakeTrieNode('*', 0, 1, 1); readNextWord(); my $firstRootChildOffset = buildNode(0); TrieNodeSetFirstChildOffset( \$gNodes[0], $firstRootChildOffset ); printNodes( \@gNodes, "done with main" ) if $debug; } sub buildNode { my ( $depth ) = @_; if ( @gCurrentWord == $depth ) { # End of word reached. If the next word isn't a continuation # of the current one, then we've reached the bottom of the # recursion tree. readNextWord(); if ($gFirstDiff < $depth) { return 0; } } my @newedges; do { my $letter = @gCurrentWord[$depth]; my $isTerminal = @gCurrentWord - 1 == $depth ? 1:0; my $nodeOffset = buildNode($depth+1); my $newNode = MakeTrieNode($letter, $isTerminal, $nodeOffset); push( @newedges, $newNode ); } while ( ($gFirstDiff == $depth) && !$gDone); TrieNodeSetIsLastSibling( \@newedges[@newedges-1], 1 ); return addNodes( \@newedges ); } # buildNode sub addNodes { my ( $newedgesR ) = @_; my $found = findSubArray( $newedgesR ); if ( $found >= 0 ) { return $found; } else { my $firstFreeIndex = @gNodes; $gNumNodes += @{$newedgesR}; print "adding...\n" if $debug; printNodes( $newedgesR ) if $debug; push @gNodes, (@{$newedgesR}); registerSubArray( ${$newedgesR}[0], $firstFreeIndex ); return $firstFreeIndex; } } # addNodes sub printNode { my ( $index, $node ) = @_; print STDERR "[$index] "; printf( STDERR "letter=%s; isTerminal=%d; isLastSib=%d; fco=%d;\n", chr(TrieNodeGetLetter($node)), TrieNodeGetIsTerminal($node), TrieNodeGetIsLastSibling($node), TrieNodeGetFirstChildOffset($node)); } # printNode sub printNodes { my ( $nodesR, $name ) = @_; my $len = @{$nodesR}; # print "printNodes($name): len = $len\n"; for ( my $i = 0; $i < $len; ++$i ) { my $node = ${$nodesR}[$i]; printNode( $i, $node ); } } # Hashing. We'll keep a hash of arrays. The keys will be stringified # representations of the first node in a subarray. The values will be # arrays of the indices at which the subarrays start. So when we want # to look for a match for an array of newEdges, we use the first node # in the array as a key and get back a list of all the indices at # which arrays begin with identical nodes. We then loop through the # array, starting a strcmp from each index. Find a match for the full # array and we have a match. Otherwise just return -1. my %gSubsHash; sub findSubArray { my ( $newedgesR ) = @_; my $key = ${$newedgesR}[0] . ""; # make a string of it if ( exists( $gSubsHash{$key} ) ) { my $bucketR = $gSubsHash{$key}; my $len = @{$newedgesR}; OUTER: foreach my $nodeLoc (@{$bucketR}) { for ( my $i = 0; $i < $len; ++$i ) { if ( $gNodes[$i+$nodeLoc] != ${$newedgesR}[$i] ) { next OUTER; } } # found it!!! return $nodeLoc; } } return -1; } # findSubArray # add to the hash sub registerSubArray { my ( $key, $nodeLoc ) = @_; $key .= ""; if ( exists $gSubsHash{$key} ) { my $bucket = $gSubsHash{$key}; push( @{$bucket}, $nodeLoc ); } else { my @arr = $nodeLoc; $gSubsHash{$key} = \@arr; } } # registerSubArray sub readNextWord { my @nextWord = {}; for ( ; ; ) { $_ = <>; if ( !$_ ) { $gDone = 1; return; } chomp; print STDERR "readNextWord: got $_\n" if $debug; @nextWord = split(//); if ( @nextWord ) { last; } } my $numCommonLetters = 0; my $len = @nextWord; if ( @gCurrentWord < $len ) { $len = @gCurrentWord; } while ( @gCurrentWord[$numCommonLetters] eq @nextWord[$numCommonLetters] && $numCommonLetters < $len) { ++$numCommonLetters; } $gFirstDiff = $numCommonLetters; @gCurrentWord = @nextWord; } # readNextWord # Print binary representation of trie array. This isn't used yet, but # eventually it'll want to dump to multiple files appropriate for Palm # that can be catenated together on other platforms. There'll need to # be a file giving the offset of the first node too. Also, might want # to move to 4-byte representation when the input can't otherwise be # handled. sub dumpNodes { for ( my $i = 0; $i < @gNodes; ++$i ) { my $node = $gNodes[$i]; my $bstr = pack( "I", $node ); print STDOUT $bstr; } } ############################################################################## # Little node-field setters and getters to hide what bits represent # what. ############################################################################## sub TrieNodeSetIsTerminal { my ( $nodeR, $isTerminal ) = @_; if ( $isTerminal ) { ${$nodeR} |= 1 << 31; } else { ${$nodeR} &= ~(1 << 31); } } sub TrieNodeGetIsTerminal { my ( $node ) = @_; return ($node & 1 << 31) != 0; } sub TrieNodeSetIsLastSibling { my ( $nodeR, $isLastSibling ) = @_; if ( $isLastSibling ) { ${$nodeR} |= 1 << 30; } else { ${$nodeR} &= ~(1 << 30); } } sub TrieNodeGetIsLastSibling { my ( $node ) = @_; return ($node & 1 << 30) != 0; } sub TrieNodeSetLetter { my ( $nodeR, $letter ) = @_; my $mask = ~(0xFF << 22); ${$nodeR} &= $mask; # clear all the bits ${$nodeR} |= (ord($letter) << 22); # set new ones } sub TrieNodeGetLetter { my ( $node ) = @_; $node >>= 22; $node &= 0xFF; return $node; } sub TrieNodeSetFirstChildOffset { my ( $nodeR, $fco ) = @_; my $mask = ~0x003FFFFF; ${$nodeR} &= $mask; # clear all the bits ${$nodeR} |= $fco; # set new ones } sub TrieNodeGetFirstChildOffset { my ( $node ) = @_; $node &= 0x003FFFFF; # 22 bits return $node; } sub MakeTrieNode { my ( $letter, $isTerminal, $firstChildOffset, $isLastSibling ) = @_; my $result = 0; TrieNodeSetIsTerminal( \$result, $isTerminal ); TrieNodeSetIsLastSibling( \$result, $isLastSibling ); TrieNodeSetLetter( \$result, $letter ); TrieNodeSetFirstChildOffset( \$result, $firstChildOffset ); return $result; } # MakeTrieNode