#!/usr/bin/perl

# Given a word list file in stdin, and the name of a file to be used
# to map letters in the word list to values 0..31, translate the list
# using the mapping, sort it by the new values, and finally build and
# emit to files a dawg.

# This is meant eventually to be runnable as part of a cgi system for
# letting users generating Crosswords dicts online.

##############################################################################
# adapted from C++ code Copyright (C) 2000 Falk Hueffner
# This version Copyright (C) 2002 Eric House
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
# USA
##############################################################################


use strict;

my $gFirstDiff;
my @gCurrentWord;
my $gDone = 0;
my @gNodes;    			# final array of nodes
my $gNumNodes = 0;
my $debug = 0;

main();

sub main { 

	# Do I need this stupid thing?  Better to move the first row to
	# the front of the array and patch everything else.  Or fix the
	# non-palm dictionary format to include the offset of the first
	# node.
    $gNodes[$gNumNodes++] = MakeTrieNode('*', 0, 1, 1);

    readNextWord();

    my $firstRootChildOffset = buildNode(0);
    TrieNodeSetFirstChildOffset( \$gNodes[0], $firstRootChildOffset );

	printNodes( \@gNodes, "done with main" ) if $debug;
}


sub buildNode {
    my ( $depth ) = @_;

    if ( @gCurrentWord == $depth ) {
	# End of word reached. If the next word isn't a continuation
	# of the current one, then we've reached the bottom of the
	# recursion tree.
	readNextWord();
	if ($gFirstDiff < $depth) {
	    return 0;
	}
    }

    my @newedges;

    do {
		my $letter = @gCurrentWord[$depth];
		my $isTerminal = @gCurrentWord - 1 == $depth ? 1:0;

		my $nodeOffset = buildNode($depth+1);
		my $newNode = MakeTrieNode($letter, $isTerminal, $nodeOffset);
		push( @newedges, $newNode );

    } while ( ($gFirstDiff == $depth) && !$gDone);

	TrieNodeSetIsLastSibling( \@newedges[@newedges-1], 1 );

    return addNodes( \@newedges );
} # buildNode

sub addNodes {
    my ( $newedgesR ) = @_;

    my $found = findSubArray( $newedgesR );

    if ( $found >= 0 ) {
		return $found;
    } else {

		my $firstFreeIndex = @gNodes;
		$gNumNodes += @{$newedgesR};

		print "adding...\n" if $debug;
		printNodes( $newedgesR ) if $debug;

		push @gNodes, (@{$newedgesR});

		registerSubArray( ${$newedgesR}[0], $firstFreeIndex );
		return $firstFreeIndex;
	}
} # addNodes

sub printNode {
    my ( $index, $node ) = @_;

	print STDERR "[$index] ";

	printf( STDERR
			"letter=%s; isTerminal=%d; isLastSib=%d; fco=%d;\n",
			chr(TrieNodeGetLetter($node)),
			TrieNodeGetIsTerminal($node),
			TrieNodeGetIsLastSibling($node),
			TrieNodeGetFirstChildOffset($node));
} # printNode

sub printNodes {
    my ( $nodesR, $name ) = @_;

    my $len = @{$nodesR};
    # print "printNodes($name): len = $len\n";

    for ( my $i = 0; $i < $len; ++$i ) {
		my $node = ${$nodesR}[$i];
		printNode( $i, $node );
    }

}


# Hashing.  We'll keep a hash of arrays.  The keys will be stringified
# representations of the first node in a subarray.  The values will be
# arrays of the indices at which the subarrays start.  So when we want
# to look for a match for an array of newEdges, we use the first node
# in the array as a key and get back a list of all the indices at
# which arrays begin with identical nodes.  We then loop through the
# array, starting a strcmp from each index.  Find a match for the full
# array and we have a match.  Otherwise just return -1.

my %gSubsHash;

sub findSubArray {
    my ( $newedgesR ) = @_;

	my $key = ${$newedgesR}[0] . ""; # make a string of it

	if ( exists( $gSubsHash{$key} ) ) {

		my $bucketR = $gSubsHash{$key};
		my $len = @{$newedgesR};

	  OUTER:
		foreach my $nodeLoc (@{$bucketR}) {
		
			for ( my $i = 0; $i < $len; ++$i ) {
				if ( $gNodes[$i+$nodeLoc] != ${$newedgesR}[$i] ) {
					next OUTER;
				}
			}
			# found it!!!
			return $nodeLoc;
		}
	}

	return -1;
} # findSubArray

# add to the hash
sub registerSubArray {
	my ( $key, $nodeLoc ) = @_;

	$key .= "";

	if ( exists $gSubsHash{$key} ) {
		my $bucket = $gSubsHash{$key};
		push( @{$bucket}, $nodeLoc );
	} else {
		my @arr = $nodeLoc;
		$gSubsHash{$key} = \@arr;
	}

} # registerSubArray

sub readNextWord {
    my @nextWord = {};
    
    for ( ; ; ) {

	$_ = <>;

	if ( !$_ ) {
	    $gDone = 1;
	    return;
	}

	chomp;
 	print STDERR "readNextWord: got $_\n" if $debug;

	@nextWord = split(//);

	if ( @nextWord ) {
	    last;
	}
    }

    my $numCommonLetters = 0;
    my $len = @nextWord;
    if ( @gCurrentWord < $len ) {
	$len = @gCurrentWord;
    }

    while ( @gCurrentWord[$numCommonLetters] eq @nextWord[$numCommonLetters]
	    && $numCommonLetters < $len) {
 	++$numCommonLetters;
    }

    $gFirstDiff = $numCommonLetters;
    @gCurrentWord = @nextWord;
} # readNextWord

# Print binary representation of trie array.  This isn't used yet, but
# eventually it'll want to dump to multiple files appropriate for Palm
# that can be catenated together on other platforms.  There'll need to
# be a file giving the offset of the first node too.  Also, might want
# to move to 4-byte representation when the input can't otherwise be
# handled.

sub dumpNodes {

    for ( my $i = 0; $i < @gNodes; ++$i ) {
        my $node = $gNodes[$i];
		my $bstr = pack( "I", $node );
		print STDOUT $bstr;
    }
}

##############################################################################
# Little node-field setters and getters to hide what bits represent
# what.
##############################################################################

sub TrieNodeSetIsTerminal {
	my ( $nodeR, $isTerminal ) = @_;

	if ( $isTerminal ) {
		${$nodeR} |= 1 << 31;
	} else {
		${$nodeR} &= ~(1 << 31);
	}
}

sub TrieNodeGetIsTerminal {
	my ( $node ) = @_;
	return ($node & 1 << 31) != 0;
}

sub TrieNodeSetIsLastSibling {
	my ( $nodeR, $isLastSibling ) = @_;
	if ( $isLastSibling ) {
		${$nodeR} |= 1 << 30;
	} else {
		${$nodeR} &= ~(1 << 30);
	}
}

sub TrieNodeGetIsLastSibling {
	my ( $node ) = @_;
	return ($node & 1 << 30) != 0;
}

sub TrieNodeSetLetter {
	my ( $nodeR, $letter ) = @_;

	my $mask = ~(0xFF << 22);
	${$nodeR} &= $mask;			            # clear all the bits
	${$nodeR} |= (ord($letter) << 22);		# set new ones
}

sub TrieNodeGetLetter {
	my ( $node ) = @_;
	$node >>= 22;
	$node &= 0xFF;
	return $node;
}

sub TrieNodeSetFirstChildOffset {
	my ( $nodeR, $fco ) = @_;
	my $mask = ~0x003FFFFF;
	${$nodeR} &= $mask;			# clear all the bits
	${$nodeR} |= $fco;	        # set new ones
}

sub TrieNodeGetFirstChildOffset {
	my ( $node ) = @_;
	$node &= 0x003FFFFF;		# 22 bits
	return $node;
}


sub MakeTrieNode {
    my ( $letter, $isTerminal, $firstChildOffset, $isLastSibling ) = @_;
    my $result = 0;

	TrieNodeSetIsTerminal( \$result, $isTerminal );
	TrieNodeSetIsLastSibling( \$result, $isLastSibling );
	TrieNodeSetLetter( \$result, $letter );
	TrieNodeSetFirstChildOffset( \$result, $firstChildOffset );

    return $result;
} # MakeTrieNode