#!/tools/cs/perl5/bin/perl #use strict; # tif # The idiom finder # # By David Mankin package tif; open OUTPUT, "> debuglog"; print OUTPUT "about to require files\n"; require 'conjtable_fr.pl'; print OUTPUT "successfully required conjtable_fr\n"; require 'conjtable_en.pl'; print OUTPUT "successfully required conjtable_en\n"; require 'translation.pl'; print OUTPUT "successfully required translation\n"; # Step 1, generate regular expressions from the idiom list. open IDIOMS, "idioms_full.txt" or die "Can't open idioms_full.txt: $!\n"; print OUTPUT "successfully opened file\n"; while () { chomp; my ($num, $en,$lit_fr, $lit_en, $fr) = split /\//, $_; print OUTPUT "English: $en\nFrench: $fr\nLit Eng: $lit_en\n"; my $en_regexp = &make_expr($en, 'en'); my $fr_regexp = &make_expr($fr, 'fr'); my $lit_en_regexp = &make_expr($lit_en, 'en'); push @en_input, $en; push @fr_input, $fr; push @lit_en_input, $lit_en; push @lit_fr_input, $lit_fr; push @en_exprs, $en_regexp; push @fr_exprs, $fr_regexp; push @lit_en_exprs, $lit_en_regexp; } &test_it2(); ### ## For each word, see if it is the start of a french expression. ## If it is, we save information off to the stack, and replace it ## so we don't try to substitue it again. ## The information we save is: ## The french source text {FR} ## The idiom number {NUM} ## The french idiom in canonical form {FR_C} ## The english idiom in canonical form {EN_C} ## The literal english translation in canonical form {LIT_EN_C} ## After the translation, we add this additional information ## The the actual literal english transation {LIT_EN} ## The information in LIT_EN and LIT_EN_C can be combined to produce ## the right form of EN_C to replace in the output. ## #perl -e 'print join ":", split /(?=\s+)/, "hello how are you? I am fine, but you\x27re not."' # Th sub fit { my ($string) = @_; $string =~ s/\n/ /sg; $string =~ s/\s\s+/ /g; #remove pairs+ of spaces # First, generate a list of where the start of every word is my (@words) = (); my (@idiom_stack) = (); my $string_en = &translation::translate_lang("fr_en", $string); $string =~ s/'(?=\S)/' /g; $string = lc $string; $string_en =~ s/\s\s+/ /g; $string_en = lc $string_en; my ($counter) = 0; foreach (split /(?=\s+)/, $string) { push @words, $counter; $counter += length $_; } push @words, $counter; #test it my $i; for ($i = 0 ; $i < @words ; $i++) { print OUTPUT substr $string, $words[$i], $words[$i+1] - $words[$i]; print OUTPUT "\n"; } my $idiom_num = 0; #now, for each word, we look for an idiom match WORD: for ($i = 0 ; $i < @words ; ) { my (%results) = (); print OUTPUT ("Evaluating string: " . substr($string, $words[$i]). "\n"); my $j; for ($j = 0 ; $j < @fr_exprs ; $j++) { print OUTPUT (" Evaluating expr $fr_exprs[$j]\n"); my (@parts); if (@parts = (substr($string, $words[$i]) =~ /^($fr_exprs[$j])/i)) { print OUTPUT "Found!!!!!!!!!!!!!\nParts:\n"; print OUTPUT join "\n", @parts; print OUTPUT "\n"; $results{FR} = $1; $results{FR} =~ s/^ //; @l = split(/ /,$results{FR}); shift @l while ($l[0] =~ /^\s*$/); print OUTPUT "IDIOM LENGTH = " . scalar @l . "\n"; $results{NUM} = $j; $results{FR_C} = $fr_input[$j]; $results{EN_C} = $en_input[$j]; $results{LIT_EN_C} = $lit_en_input[$j]; my $k; my (@idiomparts) = &get_idiom_parts ($results{FR_C}); print OUTPUT ("Idiomparts: ", join ", ", @idiomparts); print OUTPUT "\n"; for ($k = 0; $k < @idiomparts ; $k++ ) { if ($idiomparts[$k] =~ /^\{/) { # It's a verb! my $tpns = $conjtable_fr::lookup_table{lc $parts[$k+1]}; my $k1 = $k + 1; print OUTPUT ("TPNs for $parts[$k1] = $tpns\n"); $results{"FR_TPN_$k1"} = $tpns; } } print OUTPUT "Idiom matched; results: \n"; print OUTPUT join ",", %results; print OUTPUT "\n"; $idiom_num++; $i += @l; push @idiom_stack, \%results; next WORD; } } $i++; } print OUTPUT "IDIOM STACK: ------------------------------>>>>>>>>>>>\n"; print OUTPUT join ", ", @idiom_stack; print OUTPUT "\n"; my (@words_en) = (); ($counter) = 0; foreach (split /(?=\s+)/, $string_en) { push @words_en, $counter; $counter += length $_; } push @words_en, $counter; #test it my $i; for ($i = 0 ; $i < @words_en ; $i++) { print OUTPUT substr $string_en, $words_en[$i], $words_en[$i+1] - $words_en[$i]; print OUTPUT "\n"; } $idiom_num = 0; my $output = ""; #now, for each word, we look for an idiom match WORD: for ($i = 0 ; $i < @words_en ; ) { my ($includeperiod) = 0; if ($just_found_one) { $position_memory = $i; $output_memory = $output; } print OUTPUT ("Evaluating string: " . substr($string_en, $words_en[$i]). "\n"); my $j; $fr_results = $idiom_stack[$idiom_num]; my ($lit_en_expr) = $lit_en_exprs[$fr_results->{NUM}]; print OUTPUT (" Evaluating expr $lit_en_expr\n"); my (@parts); if (@parts = (substr($string_en, $words_en[$i]) =~ /^($lit_en_expr)/i)) { print OUTPUT "FOUND!!!!!!!!!!\n"; $just_found_one = 1; $fr_results->{LIT_EN} = $1; $includeperiod = 1 if $' =~ /\s*\./ ; print OUTPUT "... setting \$includeperiod\n" if ($includeperiod); my (@l) = split(/ /,$fr_results->{LIT_EN}); shift @l while ($l[0] =~ /^\s*$/); print OUTPUT "IDIOM LENGTH = " . scalar @l . "\n"; my $k; my (@idiomparts) = &get_idiom_parts ($fr_results->{LIT_EN_C}); print OUTPUT ("Idiomparts: ", join ", ", @idiomparts); print OUTPUT "\n"; for ($k = 0; $k < @idiomparts ; $k++ ) { if ($idiomparts[$k] =~ /^\{/) { # It's a verb! my $k1 = $k + 1; my $lit_en_conj_verb = $parts[$k1] ; # is the verb conjugated my $lit_en_inf_verb = substr($idiomparts[$k],1) ; # is the verb inf $lit_en_inf_verb =~ s/^\s+//g; $lit_en_inf_verb =~ s/\s+$//g; $lit_en_inf_verb =~ lc $lit_en_inf_verb; my $matchedtpn = ""; my $tpns = $fr_results->{"FR_TPN_$k1"}; TPN_SEARCH: foreach (split /,/, $tpns) { print OUTPUT "Examining french TPN: $_\n"; s/^(.*)-[^-]*$/$1/; $eng_tpns = $conjtable_en::map_fr{$_}; foreach $current_tpn (split /,/, $eng_tpns) { print OUTPUT "--> maps to english TPN: $current_tpn\n"; my $conjedform = $conjtable_en::conjugations{"$current_tpn-$lit_en_inf_verb"}; $conjedform =~ s/^\s+//g; $conjedform =~ s/\s+$//g; print OUTPUT " conjugated form of [$lit_en_inf_verb] is [$conjedform]\n"; if (lc $conjedform eq lc $lit_en_conj_verb) { print OUTPUT " ..matches!\n"; $matchedtpn = $current_tpn; last TPN_SEARCH; } } } if ($matchedtpn eq "") { print OUTPUT "WARNING!!! No Matching TPN!!! for $lit_en_conj_verb\n"; #Now, since we didnt' find it from the french tpn, we look at #the list of english tpns by form to find a TPN. my $matchedtpns = $conjtable_en::lookup_table{$lit_en_conj_verb}; print OUTPUT " ==> Looked in lookup_table and found $matchedtpns\n"; $matchedtpn = $matchedtpns if $matchedtpns !~ /,/; ($matchedtpn) = $matchedtpns =~ /^(.*?),/ if $matchedtpns =~ /,/; $matchedtpn =~ s/^(.*)-.*?$/$1/; print OUTPUT "Matched: $tpns -> [$matchedtpn] (verb=$lit_en_inf_verb)\n"; } else { print OUTPUT "Matched: $tpns -> $matchedtpn (verb=$lit_en_inf_verb)\n"; } print OUTPUT ("TPNs for $parts[$k1] = $matchedtpn\n"); $fr_results->{"EN_TPN_$k1"} = $matchedtpn; } } $idiom_num++; $i += @l; print OUTPUT "Idiom matched; results: \n"; print OUTPUT join ",", %$fr_results; print OUTPUT "\n"; ## Now we match up the literal and the idiomatic to conjugate the ## idiomatic correctly. (later) my $EN_CONJ = $fr_results->{EN_C}; &do_conjugation($fr_results); $EN_CONJ = $fr_results->{EN}; # output the idiomatic into $output print OUTPUT "Appending [$EN_CONJ] to \$output\n"; $output .= " " . $EN_CONJ; if ($includeperiod) { $ouptut .= "."; } next WORD; } $output .= substr($string_en, $words_en[$i], $words_en[$i+1]-$words_en[$i]); $i++; $just_found_one = 0; if ($i >= @words_en && $idiom_num < $#idiom_stack ) { $i = $position_memory; $output = $output_memory; $idiom_num ++; print OUTPUT "NOTE NOTE NOTE!!! Aborting looking for previous one\n" } } print OUTPUT "OUTPUT!!!!!!!!!\n\t$output\n"; return $output; ## NOTE TO SELVES: What to do next: # get Conjugated english expressions from patrick's code - done! # Conjugate english. *****!!!!!***!*!*!***!!!*!!**** # figure out what's wrong with the missing "deja". } # Step 2 -- match the french idioms. # sub fit_old { # my ($string) = @_; # # my $i ; # for ($i = 0 ; $i < @fr_exprs ; $i++) { # if ($string =~ /$fr_exprs[$i]/i) { # my $new_string = &idiom_replace($string, $i); # return $new_string; # } # } # } sub idiom_replace { my ($string, $i) = @_; print OUTPUT "idiom_replace: $string [$i]\n"; $string; #my ($en_marked, $fr_marked, $lit_en_marked) = &get_marked($string, $i); # TODO : more here } sub get_makred { my ($fr, $i) = @_; my ($lit_en) = &translation::translate($fr); my ($lit_en_marked) = &mark_it($lit_en, $i, "lit_en"); my ($fr_marked) = &mark_it($fr, $i, "fr"); my ($en_marked) = &mark_it($fr, $i, "en"); return ($en_marked, $fr_marked, $lit_en_marked); } sub mark_it{ my($str, $i, $lang) = @_; my ($before, @actual_parts) = $str =~ /^(.*?)${$lang}_exprs[$i]/; my ($after) = $'; my (@expected_parts) = split /[])}]\s*/, ${"$lang_input"}[$i]; my $j; for ($j = 0 ; $j < @expected_parts ; $j++) { } } sub get_idiom_parts { my ($text) = @_; my (@results); print OUTPUT "get_idiom_parts! TEXT=$text\n"; while ($text =~ /\s*([[{(<])(\d*)/) { # matches opening brace & number my $left = $1; my $right = $left; $right =~ tr/[{(/; print OUTPUT "-debug: left=$left; right=$right\n"; my $found = $text =~ s/^\s*(\Q$left\E)(\d)*\s*(.*?)\s*\Q$right\E\s*//i; my ($type, $num, $content) = ($1, $2, $3); push @results, "$type$content"; } return @results; } ## # This routine takes an idiom.txt-format description and # makes a regular expression out of it. ## sub make_expr { my ($text, $lang) = @_; my ($expr) = '\s*'; while ($text =~ /\s*([[{(<])(\d*)/) { # matches opening brace & number my $left = $1; my $right = $left; $right =~ tr/[{(/; print OUTPUT "-debug: left=$left; right=$right\n"; my $found = $text =~ s/^\s*\Q$left\E(\d)*\s*(.*?)\s*\Q$right\E\s*//i; my ($num, $content) = ($1, $2); print OUTPUT "-debug: num = $num; content=$content; text=$text; found=$found\n"; # handle verbs specially here if ($left eq "{") { $expndcode = '$conjtable_' . $lang . "::conjugations{'$content'}"; #print OUTPUT "About to eval: $expndcode\n"; $expand = eval $expndcode; #print OUTPUT "****Expansion of [$content] in [$lang] -> [$expand]\n"; if (defined $expand && $lang eq 'fr') { # do we want to do this?? #$expand = "(?!(?!s|j|c|m|t|l)'\s*)?($expand)"; #add parens around xxx yyy|zzz aaa to make (xxx yyy)|(zzz aaa) #my $newexpand = ""; #$newexpand = join (")|(", (split /\|/, $expand) ); #$newexpand = "($newexpand)"; #$expand = $newexpand; } $content = $expand if defined $expand; } # handle nouns specially here #TODO $expr .= "($content)" . '(?:\b|\s+)'; #if ($lang eq 'fr') { #$expr = "(?:'|\s+)$expr"; #} } $expr; } # uses the EN_C variable and conjugates it and stores it in EN sub do_conjugation{ my ($results) = @_; my $EN=""; my $k; my (@idiomparts) = &get_idiom_parts ($results->{EN_C}); print OUTPUT ("Idiomparts: ", join ", ", @idiomparts); print OUTPUT "\n"; for ($k = 0; $k < @idiomparts ; $k++ ) { my $k1 = $k +1; if ($idiomparts[$k] =~ /^\{/) { # It's a verb! # conjugate it right; my $verb_inf = substr $idiomparts[$k], 1; my $tpn = $results->{"EN_TPN_$k1"}; my $conjform = $conjtable_en::conjugations{"$tpn-$verb_inf"}; $EN .= "$conjform "; } else { $EN .= substr $idiomparts[$k], 1; } } $results->{EN} = $EN; } sub test_it { print OUTPUT "english expressions:\n" , join("\n",@en_exprs), "\n\n"; print OUTPUT "french expressions:\n" , join("\n",@fr_exprs), "\n\n"; print OUTPUT "eng_lit expressions:\n" , join("\n",@lit_en_exprs), "\n\n"; } sub test_it2 { my $string; $string = "L'homme a mang\351 le morceau deja. L'homme a mang\351 le morceau deja."; #$string = "L'homme a d\373 manger le morceau deja."; #my $string = "L'homme a d\373 manger le morceau deja. "; #my $string = "Les hommes mangerons le morceau deja. "; #my $string = "Les hommes a le morceau deja. "; #my $string = "Les hommes mang\350rent le morceau deja. "; #my $string = "L'homme ai mang\351 le morceau de ja."; $stringz = "un coup de fil deja, et a propos."; $string = < \n$str2\n"; } sub test_it3 { open INPUT, "idioms_french.txt" || die "error:$!"; my $string = join "", ; my $str2 = fit($string); print OUTPUT "$string -> \n$str2\n"; } 1;