# # file: translit2koi8.pm # purpose: perl module for converting russian translit encoding to koi8-r # created: pasha nov 3 2000 # modified: pasha may 1 2008 # modification: one more pattern for e_obortonoe # synopsis: use translit2koi8; # $koi8_string = translit2koi8 ($translit_string); # pending: 1. exceptions from few-letters-patterns ('shvatil') ? # 2. maybe this should be optimized for speed - for now # we didn't care about it much... # 3. make it oo or something - in the present form it's ugly # package translit2koi8; use Exporter(); @ISA = qw(Exporter); @EXPORT = qw(translit2koi8 translit2win koi8_cmp); use strict; ######################### data for convertion ####################### # koi8 use constant CONV_KOI8 => { 'e_oborotnoe' => 'DC', # well, that's ugly, but sometimes we have no other # choice than to specify it literally, # as both words, with "e" and "e_oborotnoe", are valid # (like "ser") 'shch' => 'DD', 'i\'' => 'CA', # "i kratkoe" 'kh' => 'C8', # "ha" (first variant) 'zh' => 'D6', # "zhe" (first variant) 'ch' => 'DE', 'sh' => 'DB', 'yu' => 'C0', 'ya' => 'D1', 'yo' => 'A3', # "yo" (e with two dots) 'a' => 'C1', 'b' => 'C2', 'v' => 'D7', 'g' => 'C7', 'd' => 'C4', 'e' => 'C5', 'j' => 'D6', # "zhe" (second variant) 'z' => 'DA', # "ze" 'i' => 'C9', 'k' => 'CB', 'l' => 'CC', 'm' => 'CD', 'n' => 'CE', 'o' => 'CF', 'p' => 'D0', 'r' => 'D2', 's' => 'D3', 't' => 'D4', 'u' => 'D5', 'f' => 'C6', 'h' => 'C8', # "ha" (second variant) 'c' => 'C3', # "tse" 'y' => 'D9', '\'' => 'D8', # "myagkii znak" '"' => 'DF' # "tverdyi znak" }; # windows use constant CONV_WIN => { 'e_oborotnoe' => 'FD', 'shch' => 'F9', 'i\'' => 'E9', # "i kratkoe" 'kh' => 'F5', # "ha" (first variant) 'zh' => 'E6', # "zhe" (first variant) 'ch' => 'F7', 'sh' => 'F8', 'yu' => 'FE', 'ya' => 'FF', 'a' => 'E0', 'b' => 'E1', 'v' => 'E2', 'g' => 'E3', 'd' => 'E4', 'e' => 'E5', 'j' => 'E6', # "zhe" (second variant) 'z' => 'E7', # "ze" 'i' => 'E8', 'k' => 'EA', 'l' => 'EB', 'm' => 'EC', 'n' => 'ED', 'o' => 'EE', 'p' => 'EF', 'r' => 'F0', 's' => 'F1', 't' => 'F2', 'u' => 'F3', 'f' => 'F4', 'h' => 'F5', # "ha" (second variant) 'c' => 'F6', # "tse" 'y' => 'FB', '\'' => 'FC', # "myagkii znak" '"' => 'FA' # "tverdyi znak" }; # replace first occurence of "e" to "e oborotnoe" # in words beginning with the following strings use constant E_OBOROTNOE_PATTERNS => [qw(Daniel ebonit edak edgar edip effekt efir egoi eho ekho ei'lat ei'nshtei'n ekran ekscentri ekscess ekshn eksklyuziv ekskurs ekspedici eksperiment ekspluat ekspo ekspress ekstrapol ekvivalent ekzamen ekzemplyar ekzoti eleg elek element elit ellip emanaci emmiter engel energ entomolog entropi entuzia engel's emoci enciklopedi epataj epatazh epichesk epidem epigraf epikur epilog epizod epoh epokh epolet epopeya epos epsilon ergodichesk eridana ermitazh ernest eroti erudi eshelon espan'olk eskhil eskiz espinoza esse estet eston estrad etaj etak eta eti etnichesk etnograf etnos eto etu etyud evaku evolyuc ezoter kanoe Keler Kerrol kinoekspedic kinoeskiz kvintessenciya mnogoetazh odnoetazh poeksperimentiro poeta poetessa poeti poetomu poetu poezi poeznost proeks Rafael siluet Uel)]; ########################## end of data for convertion ##################### # shift between small and capital letters: use constant HEXCAPSHIFT_KOI8 => 0x20; # in koi8-r use constant HEXCAPSHIFT_WIN => -0x20; # in win ########################## "e oborotnoe"-related constants ################ use constant E_OBOROTNOE_INT_KOI8 => hex(CONV_KOI8->{'e_oborotnoe'}); use constant E_OBOROTNOE_KOI8 => chr(E_OBOROTNOE_INT_KOI8); use constant E_OBOROTNOE_CAPITAL_KOI8 => chr(E_OBOROTNOE_INT_KOI8 + HEXCAPSHIFT_KOI8); use constant E_OBOROTNOE_INT_WIN => hex(CONV_WIN->{'e_oborotnoe'}); use constant E_OBOROTNOE_WIN => chr(E_OBOROTNOE_INT_WIN); use constant E_OBOROTNOE_CAPITAL_WIN => chr(E_OBOROTNOE_INT_WIN + HEXCAPSHIFT_WIN); # split "e oborotnoe" patterns around the first occurence of "e" my @E = (); my ($before, $after); for (@{(E_OBOROTNOE_PATTERNS)}) { ($before, $after) = split (/e/, $_, 2); push (@E, {'before' => $before, 'after' => $after}); } ########################## end of "e oborotnoe"-related constants ######### ################### russian (ordered) alphabet in translit ################ # currently used only in koi8_cmp() use constant ALPHABET => [qw(a b v g d e j z i i' k l m n o p r s t u f h c ch sh shch " y ' e_oborotnoe yu ya)]; use constant NUM_IN_ALPHABET => scalar @{ALPHABET()}; my %ALPHABET_KOI8 = (); # key - koi8 letter (both capital and smalls) # value - it's order number in alphabet my %ALPHABET_WIN = (); my $counter = 0; my $hex; for (@{ALPHABET()}) { ++$counter; $hex = hex (CONV_KOI8->{$_}); $ALPHABET_KOI8{chr($hex)} = $counter + NUM_IN_ALPHABET; # small letters $ALPHABET_KOI8{chr($hex + HEXCAPSHIFT_KOI8)} = $counter; # capital letters $hex = hex (CONV_WIN->{$_}); $ALPHABET_WIN{chr($hex)} = $counter + NUM_IN_ALPHABET; # small letters $ALPHABET_WIN{chr($hex + HEXCAPSHIFT_WIN)} = $counter; # capital letters } ################### end of composing russian alphabet ##################### # wrappers around the main function sub translit2koi8 ($) { return (_translit2 ($_[0], E_OBOROTNOE_KOI8, E_OBOROTNOE_CAPITAL_KOI8, CONV_KOI8, HEXCAPSHIFT_KOI8)); } sub translit2win ($) { return (_translit2 ($_[0], E_OBOROTNOE_WIN, E_OBOROTNOE_CAPITAL_WIN, CONV_WIN, HEXCAPSHIFT_WIN)); } # main function which does the job: # transform given translit string to koi8 string # args: 0 - string to transform, # 1 - E_OBOROTNOE # 2 - E_OBOROTNOE_CAPITAL # 3 - ref to convertion hash # 4 - hexcapshift value sub _translit2 ($$$$) { my ($input, $e_oborotnoe, $e_oborotnoe_capital, $conv, $hexcapshift) = @_; return ('') if (! defined ($input)); # special case: turning "e" to "e oborotnoe" my ($before, $Before, $BEFORE, $after, $AFTER); for my $e (@E) { $before = $e->{'before'}; $Before = ucfirst ($before); $BEFORE = uc ($before); $after = $e->{'after'}; $AFTER = uc ($after); for ($input) { s/\b($before|$Before)e$after/$1 . $e_oborotnoe . $after/eg; s/\b${BEFORE}E($after|$AFTER)/$BEFORE . $e_oborotnoe_capital . $1/eg; } } # mark quotes and double quotes around words # (in order not to replace them by corresponding koi-8 characters) for ($input) { s/([\s\(])"([a-zA-Z0-9_\-',\.:!?\ ]*?)"([\s,\.:!\)]|$)/$1__^^__$2__^^__$3/g; s/([\s\(])'([a-zA-Z0-9_\-",\.:!?\ ]*?)'([\s,\.:!\)]|$)/$1__^__$2__^__$3/g; } # process "conversion hash" in descending order by key length my ($uc, $ucfirst, $hex); for my $from (sort { length($b) <=> length($a) } keys %{$conv}) { $uc = uc ($from); $ucfirst = ucfirst ($from); $hex = hex ($conv->{$from}); for ($input) { s/$from/chr($hex)/eg; s/($uc|$ucfirst)/chr($hex + $hexcapshift)/eg; } } # restore quotes and double quotes around words for ($input) { s/__\^__/'/g; s/__\^\^__/"/g; } # remove separation sign '^' # we use it to separate between letters # (like sh in 'shvatil', where it should be two separated # letters "s" and "h", and not one "sh") $input =~ s/\^//g; return ($input); } # end of translit2koi8() #TODO: probably not to process one-element hash keys in the loop above, # but use much faster 'tr' instead: # #my $singles_from = ''; #my $singles_to = ''; #for (keys (%SINGLES)) #{ # $hex = hex($SINGLES{$_}); # $singles_from .= $_ . uc($_); # $singles_to .= chr($hex) . chr($hex + HEXCAPSHIFT_KOI8); #} #eval ("\$STRING =~ tr/$singles_from/$singles_to/"); # compare two koi8 strings # return: 0, -1, 1 sub koi8_cmp ($$) { my @zero = split (/ */, $_[0]); my @first = split (/ */, $_[1]); my $r; for (my $i=0;; $i++) { if ((! defined $zero[$i]) && (! defined $first[$i])) { return (0); } elsif ((! defined $zero[$i]) && (defined $first[$i])) { return (-1); } elsif ((defined $zero[$i]) && (! defined $first[$i])) { return (1); } if (! exists $ALPHABET_KOI8{$zero[$i]}) { print (STDERR "$zero[$i] in $_[0] is not in alphabet\n"); exit (-1); } if (! exists $ALPHABET_KOI8{$first[$i]}) { print (STDERR "$first[$i] in $_[1] is not in alphabet\n"); exit (-1); } $r = ($ALPHABET_KOI8{$zero[$i]} <=> $ALPHABET_KOI8{$first[$i]}); return ($r) if ($r != 0); } } # end of koi8_cmp() 1; __END__