#!/usr/bin/env perl
# wren ng thornton, hw5, 600.465 Eisner
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

use warnings;
use strict;

print "Usage: $0 trainingfile testfile\n\n" and exit 1
	unless 2 == @ARGV and -r $ARGV[0] and -r $ARGV[1];

$| = 1; # Autoflush for debugging purposes


#~~~~~ Read in the training file
# CountAB := counts of A-given-B (N.B. not A-to-B)
my (%CountT, %CountTT, %CountWT, %TagDict, 
	%CountW, %SingTT,  %SingWT,  @AllTags, $CountAllWords, $CountVocab); {
	
	open my $training, "<", $ARGV[0]
		or die "$0: Couldn't open training file: $!\n";
	
	die "$0: training file doesn't match specifications\n"
		unless <$training> eq "###/###\n"; # N.B. this does -1 on $CountAllWords
	
	my $prev_tag = '###';
	while (<$training>) { chomp; next unless $_; my ($word, $tag) = split '/';
		
		$CountT{$tag}              += 1;
		$CountW{$word}             += 1;
		
		$CountTT{"$tag/$prev_tag"} += 1;
		if      ($CountTT{"$tag/$prev_tag"} == 1) {
			$SingTT{$prev_tag}     += 1;
		} elsif ($CountTT{"$tag/$prev_tag"} == 2) {
			$SingTT{$prev_tag}     -= 1;
		}
		
		$CountWT{"$word/$tag"}     += 1;
		if      ($CountWT{"$word/$tag"} == 1) {
			push @{$TagDict{$word}}, $tag;
			$SingWT{$tag}          += 1;
		} elsif ($CountWT{"$word/$tag"} == 2) {
			$SingWT{$tag}          -= 1;
		}
		
		$prev_tag = $tag;
	}
	close $training;
	
	# Needed to predict for novel words, only calculating so as to not include '###'
	foreach my $tag (keys %CountT) {
		push @AllTags, $tag
			unless $tag eq '###';
	}
	
	# Needed for smoothing
	foreach my $word (keys %CountW) {
		$CountAllWords += $CountW{$word};
		$CountVocab    += 1;
	}
	$CountVocab += 1; # for OOV
}


#~~~~~ Read in the testing file
my (@Words, @RealTags); {
	open my $testing, "<", $ARGV[1]
		or die "$0: Couldn't open testing file: $!\n";
	while (<$testing>) { chomp; next unless $_; my ($word, $tag) = split '/';
		push @Words,    $word;
		push @RealTags, $tag;
	}
	close $testing;
}


# Using => instead of comma to make the conditional a bit more visually explicit
# N.B. these arguments are in reverse order from p(a|b) because of how => looks
sub ptt($$) { my ($ti_1 => $ti) = @_;
	my $mycount  = $CountTT{"$ti/$ti_1"} || 0;
	my $allcount = $CountT{$ti_1}        || die "$0: count of tag $ti_1 is zero!\n";
	my $lambda   = $SingTT{$ti_1}        || 1e-100; # avoid p==0 when $mycount and $lambda are 0
	my $backoff  = ($CountT{$ti}         || die "$0: count of tag $ti is zero!\n")
					/ $CountAllWords; # unsmoothed pt(ti)
	
	return ($mycount + $lambda * $backoff) / ($allcount + $lambda);
}


# N.B. still calling this "probability of word given tag", even though order of args swapped
sub pwt($$) { my ($ti => $wi) = @_;
	my $mycount  = $CountWT{"$wi/$ti"} || 0;
	my $allcount = $CountT{$ti}        || die "$0: count of tag $ti is zero!\n";
	my $lambda   = $SingWT{$ti}        || 1e-100; # avoid p==0 when $mycount and $lambda are 0
	my $backoff  = (($CountW{$wi} || 0) + 1) / ($CountAllWords + $CountVocab); # add-one pw(wi)
	
	return ($mycount + $lambda * $backoff) / ($allcount + $lambda);
}


# To enable prediction of tags for novel words
sub tag_dict($) { my ($word) = @_;
	if (exists $TagDict{$word}) {
		return @{$TagDict{$word}};
	} else {
		return @AllTags;
	}
}


use constant                # Defining separately so we can use it in the other defs
	'Inf'     => 6**6**6;   # More than enough to overflow to +inf
use constant {              # Multiple defs this requires a hashref not a list!	
	'NegInf'  => -Inf(),    # Apparently just `-Inf` is ambiguous... N.B. NegInf is true!
	'NaN'     => Inf - Inf, # N.B. NaN is true!
	'NegZero' => -0.0,      # Need explicit floating point zero to get it; N.B. -0.0 == 0 is true
};

use subs 'log'; # so we can override the built-in function (rather than using a different name)
sub log($) { my ($x) = @_;
	return $x == 0 ? NegInf : CORE::log $x;
}

#~~~~~ Run Viterbi over it
my (%Backpointer, $Perplexity); {
	my %lmu; $lmu{'###/0'} = log 1; # dunno why perl demands breaking the `my` up
	                                # mu is Viterbi approximation for alpha
	foreach my $i (1..@Words-1) {
		foreach my $ti (tag_dict $Words[$i]) {
			foreach my $ti_1 (tag_dict $Words[$i-1]) {
				
				if ((my $lmu = $lmu{"$ti_1/".($i-1)}) != NegInf) {
					$lmu += log ptt($ti_1 => $ti) * pwt($ti => $Words[$i]);
					
					if (not exists $lmu{"$ti/$i"} or $lmu > $lmu{"$ti/$i"}) {
						$lmu{"$ti/$i"}         = $lmu;
						$Backpointer{"$ti/$i"} = $ti_1;
					}
				} else {
					warn "* zero probability for word $Words[$i] as tag $ti after tag $ti_1\n";
				}
			}
		}
	}
	
	$Perplexity = exp (- $lmu{'###/'.(@Words-1)} / (@Words-1));
	warn "* Infinite perplexity due to zero probability\n"
		if $Perplexity == Inf;
}


#~~~~~ Generate the actual tag sequence
my (@Tags); {
	$Tags[@Words-1] = '###';
	foreach my $i (reverse 1..@Words-1) {
		$Tags[$i-1] = $Backpointer{"$Tags[$i]/$i"}; # Buglet: undef in (.) when p==0
	}
}


#~~~~~ Report accuracy et al
my (%correct, %count);
foreach my $i (0..@Tags-1) {
	unless ($RealTags[$i] eq '###') {
		my $match = $Tags[$i] eq $RealTags[$i]; # Buglet: undef in `eq` because of the buglet above
		
		$correct{'total'} += 1 if $match;
		$count{'total'}   += 1;
		
		if (exists $TagDict{$Words[$i]}) { # `exists` is safe because tag_dict() doesn't autovivify
			$correct{'known'} += 1 if $match;
			$count{'known'}   += 1;
		} else {
			$correct{'novel'} += 1 if $match;
			$count{'novel'}   += 1;
		}
	}
}

printf "Tagging accuracy: %.2f%% (known: %.2f%% novel: %.2f%%)\n"
	. "Perplexity per tagged test word: %.3f\n",
	($count{'total'} ? 100 * ($correct{'total'} || 0) / $count{'total'} : 0),
	($count{'known'} ? 100 * ($correct{'known'} || 0) / $count{'known'} : 0),
	($count{'novel'} ? 100 * ($correct{'novel'} || 0) / $count{'novel'} : 0),
	$Perplexity;
