#!/usr/bin/perl
#
# Written by Jon Dehdari 2004-2005
# Perl 5.8
# Stemmer and Morphological Parser for Persian
# The license for this stemmer only is the LGPLv2.1 (www.fsf.org)
#
# The format of the resolve.txt file is as follows:
#   1. Mokassar:          'ktb ktAb'  OR  'ktb ktAb_+PL'
#   2. Preparsed (speed): 'krdn kr_+dn'
#   3. Don't stem:        'bArAn bArAn'
#   4. Stop word:         'u '

use strict;
use Getopt::Long;
#use diagnostics;

# Program identification; $title is interpolated into the usage text below.
my $version   = "0.4.5";
my $date      = "2005/06/05";
my $copyright = "(c) 2004-2005 Jon Dehdari - GPL v2.";
my $title     = "Persian stemmer $version, $date - $copyright";

# Command-line state.  Each flag is off (0) unless the matching option in the
# GetOptions call below turns it on.
my $resolve_file   = "resolve.txt";    # -s / --stoplist
my $recall         = 0;                # -R / --recall
my $show_links     = 0;                # -l / --links
my $show_only_root = 0;                # -r / --root
my $tokenize       = 0;                # -t / --tokenize
my $unvowel        = 0;                # -u / --unvowel

# Working storage.  Within the part of the file visible here, @resolve,
# %resolve and $resolve are referenced only by the commented-out resolve-file
# loader below, and @line is not referenced at all.
my @line;
my %resolve;
my @resolve;
my $resolve;

# NOTE(review): presumably the transliterated letters that occur only in
# Arabic loanwords -- not used in the visible part of the file; confirm
# against the code that consumes it.
my $ar_chars = "EqHSTDZLVU";
#my $longvowel = "Aui]";

# Usage text, printed for -h/--help and passed to die() when option parsing
# fails.  The heredoc interpolates $title and $0; its terminator must sit on
# a line of its own.
my $usage = <<"END_OF_USAGE";
${title}

Syntax:    perl $0 [options] < input > output

Function:  Stemmer and morphological analyzer for the Persian language.
           Inflexional morphemes are separated from their roots.

Options:
  -h, --help        Print usage
  -l, --links       Show morphological links
  -r, --root        Return only word roots
  -R, --recall      Increase recall by parsing ambiguous affixes
  -s, --stoplist    Use stopword list (default: ./resolve.txt)
  -t, --tokenize    Tokenize punctuation
  -u, --unvowel     Remove short vowels
  -v, --version     Print version

END_OF_USAGE

# Parse the command line.  -h and -v print and exit immediately; any
# unrecognised option makes GetOptions return false, so the script dies with
# the usage text.  The ':s' spec makes the --stoplist value optional: a bare
# -s sets $resolve_file to the empty string rather than keeping the default.
GetOptions(
    'h|help|?'     => sub { print $usage; exit; },
    'l|links'      => \$show_links,
    'r|root'       => \$show_only_root,
    'R|recall'     => \$recall,
    's|stoplist:s' => \$resolve_file,
    't|tokenize'   => \$tokenize,
    'u|unvowel'    => \$unvowel,
    'v|version'    => sub { print "$version\n"; exit; },
) or die $usage;

# Disabled loader for the resolve file: reads tab-separated "key<TAB>value"
# lines into %resolve.  Kept commented out, as in the original.
#open RESOLVE, "$resolve_file";
#while ($resolve = <RESOLVE>) {
#    chomp $resolve;
#    @resolve = split /\t/, $resolve;
#    %resolve = ( %resolve, "$resolve[0]" => "$resolve[1]" , );
#}

while ($_ = <>) {
    chomp $_;

    if ( $unvowel ) {
        $_ =~ s/\b([aeo])/|/g;    # Inserts alef before words that begin with short vowel
        $_ =~ s/\bA/]/g;          # Changes long 'aa' at beginning of word to alef madda
        $_ =~ s/[aeo~]//g;        # Finally, removes all other short vowels and tashdids
    }

    #Inserts ZWNJ's where they should have been originally, but weren't
    $_ =~ s/(?)