#! /usr/bin/perl
# Written by Jon Dehdari 2004
# Perl 5.8 or later (uses Encode)
# Converts Persian and Arabic HTML Unicode text to Romanized Persian/Arabic text
# Syntax: ./htmlunicode2roman_1-4.pl < input.html > output.txt
#
# Input is read from STDIN and the romanized text is written to STDOUT.
# Characters may arrive either as numeric character references (&#1575; or
# &#x627;) or as raw UTF-8; both are transliterated through %unicode2roman.
# Any character that has no entry in the table is dropped from the output.

use strict;
use warnings;
use utf8;                   # this source file contains UTF-8 literals
use Encode qw(decode);

# Code points that are rewritten before the table lookup.
my $ALEF         = "\x{0627}";    # ARABIC LETTER ALEF
my $ALEF_MADDA   = "\x{0622}";    # ARABIC LETTER ALEF WITH MADDA ABOVE
my $INITIAL_ALEF = "\x{263F}";    # placeholder for word-initial alef ("fake" value)

# Maps one input character to its romanized form.
my %unicode2roman = (
    '۰' => '0',     # EXTENDED ARABIC-INDIC DIGIT ZERO   # Farsi specific
    '۱' => '1',     # EXTENDED ARABIC-INDIC DIGIT ONE    # Farsi specific
    '۲' => '2',     # EXTENDED ARABIC-INDIC DIGIT TWO    # Farsi specific
    '۳' => '3',     # EXTENDED ARABIC-INDIC DIGIT THREE  # Farsi specific
    '۴' => '4',     # EXTENDED ARABIC-INDIC DIGIT FOUR   # Farsi specific
    '۵' => '5',     # EXTENDED ARABIC-INDIC DIGIT FIVE   # Farsi specific
    '۶' => '6',     # EXTENDED ARABIC-INDIC DIGIT SIX    # Farsi specific
    '۷' => '7',     # EXTENDED ARABIC-INDIC DIGIT SEVEN  # Farsi specific
    '۸' => '8',     # EXTENDED ARABIC-INDIC DIGIT EIGHT  # Farsi specific
    '۹' => '9',     # EXTENDED ARABIC-INDIC DIGIT NINE   # Farsi specific
    $ALEF         => 'A',    # ARABIC LETTER ALEF
    $INITIAL_ALEF => '|',    # ARABIC LETTER ALEF, initial position (placeholder)
    'ب' => 'b',     # ARABIC LETTER BEH
    'ة' => 'a',     # ARABIC LETTER TEH MARBUTA
    'پ' => 'p',     # ARABIC LETTER PEH
    'ت' => 't',     # ARABIC LETTER TEH
    'ث' => 'V',     # ARABIC LETTER THEH
    'ج' => 'j',     # ARABIC LETTER JEEM
    'چ' => 'c',     # ARABIC LETTER TCHEH
    'ح' => 'H',     # ARABIC LETTER HAH
    'خ' => 'x',     # ARABIC LETTER KHAH
    'د' => 'd',     # ARABIC LETTER DAL
    'ذ' => 'L',     # ARABIC LETTER THAL
    'ر' => 'r',     # ARABIC LETTER REH
    'ز' => 'z',     # ARABIC LETTER ZAIN
    'ژ' => 'J',     # ARABIC LETTER JEH
    'س' => 's',     # ARABIC LETTER SEEN
    'ش' => 'C',     # ARABIC LETTER SHEEN
    'ص' => 'S',     # ARABIC LETTER SAD
    'ض' => 'D',     # ARABIC LETTER DAD
    'ط' => 'T',     # ARABIC LETTER TAH
    'ظ' => 'Z',     # ARABIC LETTER ZAH
    'ع' => 'E',     # ARABIC LETTER AIN
    'غ' => 'G',     # ARABIC LETTER GHAIN
    'ف' => 'f',     # ARABIC LETTER FEH
    'ق' => 'q',     # ARABIC LETTER QAF
    'ك' => 'k',     # ARABIC LETTER KAF
    'ک' => 'k',     # ARABIC LETTER KEHEH
    'گ' => 'g',     # ARABIC LETTER GAF
    'ل' => 'l',     # ARABIC LETTER LAM
    'م' => 'm',     # ARABIC LETTER MEEM
    'ن' => 'n',     # ARABIC LETTER NOON
    'و' => 'u',     # ARABIC LETTER WAW
    'ه' => 'h',     # ARABIC LETTER HEH
    'ي' => 'i',     # ARABIC LETTER YEH
    'ی' => 'i',     # ARABIC LETTER FARSI YEH
    'ى' => 'A',     # ARABIC LETTER ALEF MAKSURA
    "\x{064E}" => 'a',    # ARABIC FATHA
    "\x{064F}" => 'o',    # ARABIC DAMMA
    "\x{0650}" => 'e',    # ARABIC KASRA
    "\x{0651}" => '~',    # ARABIC SHADDA
    $ALEF_MADDA => ']',   # ARABIC LETTER ALEF WITH MADDA ABOVE
    'ء' => 'M',     # ARABIC LETTER HAMZA
    "\x{064B}" => 'N',    # ARABIC FATHATAN
    'أ' => '|',     # ARABIC LETTER ALEF WITH HAMZA ABOVE  # temp
    'ؤ' => 'U',     # ARABIC LETTER WAW WITH HAMZA ABOVE
    'إ' => '|',     # ARABIC LETTER ALEF WITH HAMZA BELOW  # temp
    'ئ' => 'I',     # ARABIC LETTER YEH WITH HAMZA ABOVE
    'ۀ' => 'X',     # ARABIC LETTER HEH WITH YEH ABOVE
    '٪' => '%',     # ARABIC PERCENT SIGN
    '،' => ',',     # ARABIC COMMA
    '؛' => ';',     # ARABIC SEMICOLON
    '؟' => '?',     # ARABIC QUESTION MARK
    "\x{200C}" => '-',    # ZERO WIDTH NON-JOINER
    ' '  => ' ',    # SPACE
    '('  => '(',    # LEFT PARENTHESIS
    ')'  => ')',    # RIGHT PARENTHESIS
    '.'  => '.',    # FULL STOP
    ':'  => ':',    # COLON
    "\n" => "\n",   # LINE FEED
);

# romanize_line($text) -> $romanized
#
# Takes one line of HTML as a character string and returns its romanized
# form. Markup is stripped first, numeric character references are then
# decoded, and finally every character is looked up in %unicode2roman.
# Characters without a table entry contribute nothing to the result.
sub romanize_line {
    my ($line) = @_;
    return '' unless defined $line;

    # Collapse a doubled newline into a single one.
    # NOTE(review): on line-at-a-time input this cannot match; the pattern
    # may originally have named an HTML tag that was lost -- confirm.
    $line =~ s/\n\n/\n/g;

    # Cell and block ends become line breaks in the output.
    $line =~ s/<\/td>/\n/g;
    $line =~ s/<\/div>/\n/g;

    $line =~ s/<.*?>//g;    # Deletes all HTML tags that open and close on this line
    $line =~ s/<.*//g;      # Deletes the 1st part of a line-spanning HTML tag
                            # (everything from the stray '<' to the end of the line)
    $line =~ s/.*?>//g;     # Deletes the 2nd part of a line-spanning HTML tag
    $line =~ s/body { text-align:right }//g;    # Removes stylesheet stuff

    # Decode decimal (&#1575;) and hexadecimal (&#x627;) character references.
    # Done after tag stripping so a decoded '<' or '>' is never taken for markup.
    $line =~ s/&#(?:[xX]([0-9A-Fa-f]+)|([0-9]+));/chr(defined $1 ? hex($1) : $2)/ge;

    # A word-initial alef becomes the placeholder alef used by the author's
    # romanization scheme, eg. Au -> |u . This must run before the madda
    # rule below, which itself produces plain alefs.
    $line =~ s/ $ALEF/ $INITIAL_ALEF/g;
    $line =~ s/^$ALEF/$INITIAL_ALEF/g;      # Same, at beginning of a line

    # A word-initial alef-with-madda is written as a plain alef.
    $line =~ s/ $ALEF_MADDA/ $ALEF/g;
    $line =~ s/^$ALEF_MADDA/$ALEF/g;        # Same, at beginning of a line

    # Transliterate one character at a time; unmapped characters are dropped.
    return join '',
        map { exists $unicode2roman{$_} ? $unicode2roman{$_} : () }
        split //, $line;
}

# Reads STDIN line by line and prints the romanized text to STDOUT.
sub main {
    while ( defined( my $line = <STDIN> ) ) {
        # Input bytes are interpreted as UTF-8; malformed sequences are
        # replaced rather than aborting, and end up dropped as unmapped.
        print STDOUT romanize_line( decode( 'UTF-8', $line ) );
    }
    return;
}

# Run as a script, but stay silent when loaded by another file (eg. a test).
main() unless caller;

1;