#! /usr/bin/perl
# Written by Jon Dehdari 2004
# Perl 5.8
# Stemmer and Syntax Parser for Persian
# The license is the GPL (www.fsf.org)

# The format of the resolve.txt file is as follows:
# 1. Mokassar: 		'ktb	ktAb'    OR    'ktb	ktAb_+PL'
# 2. Preparsed (speed):	'krdn	kr_+dn'
# 3. Don't stem:	'bArAn	bArAn'
# 4. Stop word:		'u	'

use strict;
use utf8;
#use diagnostics;
#binmode(STDOUT, ":utf8");
use LWP::Simple qw(!head);
use CGI qw(:standard); #must use this full line, not just CGI
$CGI::POST_MAX=20000;
my $query = new CGI;

my $input_type    = param ("input_type");
my $width         = param ("width");
my $remove_stops  = param ("remove_stops");
my $use_web_page  = param ("use_web_page");
my $web_page      = param ("web_page");
my $use_file      = param ("use_file");
my $uploaded_file = param ("uploaded_file");
my $text_from     = param ("text_from");
#my $input_type   = "1";
#my $text_from    = "\u{d986}\u{d8a7}\u{d986}\u{d987}\u{d8a7}";
#my $text_from    = "\u{0646}\u{0627}\u{0646}\u{0647}\u{0627}";
#my $text_from    = "&#1606;&#1575;&#1606;&#1607;&#1575;";
#my $text_from    = "قدومي بفارغ الصبر";
#my $text_from    = "CmA nmidAnid.";
#my $use_web_page = "false";
#my $web_page     = "false";
#my $remove_stops = "false";
#my $use_file     = "false";
#my $uploaded_file = "false";
my %unicode2roman;
my $text_from_new;
my @charx;
my $charx;
my $input_rtl;
my $resolve_file;
my %resolve;
my @resolve;
my $resolve;
my $ar_chars    = "EqHSTDZLVU";
#my $longvowel    = "Aui]";

if ($input_type =~ /[012357]/) { $input_rtl = "true"; }

### Checks for stupid width inputs and corrects them
if ($width > 800) { $width = "800"; }
if ($width < 20) { $width = "20"; }

#if ($remove_stops eq "true") {$resolve_file = "resolve_no_stops.txt"; }
#else ($remove_stops ne "true") {$resolve_file = "resolve.txt"; }
$resolve_file = "resolve.txt"; 

if ($input_type eq 4) { print $query->header( -charset => 'ISO-8859-1'); }
#if ($input_type eq 1) { print $query->header( -charset => 'UTF-8'); }
else {  print $query->header( -charset => 'windows-1256'); }


print( 
	'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">',
	"\n",
	'<html lang="en">',
	"\n",
	"\n",
#	"<style>\n",
#	"    body \{ text-align\:right \}\n",
#	"<\/style>\n",
	"<head>\n",
);

if ($input_type eq 1) { print '<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">'; }
#if ($input_type eq 1) { print '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'; }
if ($input_type eq 3) { print '<meta http-equiv="Content-Type" content="text/html; charset=windows-1256">'; }

print(
        "\n",
	"<title>Persian LG Syntax Parser <\/title>\n",
	"<\/head>\n",
	"<body>\n",
        '<table width="100%" border="1" cellspacing="0" cellpadding="9">',
        "\n",
        "  <tr>\n",
);


# For getting web page stuff
if ($use_web_page eq "true") {
$text_from = "";  # Clears out residue from the web form
$text_from = get "$web_page";
print('</td>
     <td width="100%">
');
#print("$text_from");
}


elsif ($use_file eq "true") {
$text_from = "";  # Clears out residue from the web form
print('</td>
     <td width="100%">
');
while (my $line = <$uploaded_file> ) {
$text_from = $text_from . $line ;
}
#if (length $text_from > 40000 ) {
#  die "Your uploaded file is too big.<br/></td></tr></table></html>\n";
#}
}

if (0 == 1 && $use_web_page ne "true" && $use_file ne "true") {	# Dont use this statement
my $formated_text_from = "$text_from";
$formated_text_from =~ s/\n/<br\/>\n/g; #changes newline to <br>for from text
if ($input_rtl eq "true") { print("   <td width=\"50%\" align=\"right\"><br/>\n"); }
else { print("   <td width=\"50%\"><br/>\n"); }
print("$formated_text_from");
print('
   </td>'); # closes from text side (left side)

#Prints second column, the one with the new stuff
print('
   <td width="50%">
'); 
} # instead, do the following:
print '<td width="100%">';


# Removes HTML tags, and the like
#if (0==1){
if ($use_web_page eq "true") {
  $text_from =~ s/<br>/BbrR/g;
  $text_from =~ s/<p>/BppR/g;
  $text_from =~ s/BbrR{3,}/\n/g;
  $text_from =~ s/BppR{3,}/\n/g;
  $text_from =~ s/BbrR/\n/g;
  $text_from =~ s/BppR/\n/g;
  $text_from =~ s/<.*?>//g;
  $text_from =~ s/<.*?//g;  # Deleses 1st part of line-spanning HTML tags
  $text_from =~ s/.*?>//g;  # Deletes 2nd part of line-spanning HTML tags
  $text_from =~ s/{.*?}//g;
  $text_from =~ s/{.*?//g;  # Deleses 1st part of line-spanning style tags
  $text_from =~ s/.*?}//g;  # Deletes 2nd part of line-spanning style tags
#  $text_from =~ s/BbrR/<br\/>\n/g;
#  $text_from =~ s/BppR/<br\/>\n/g;
  $text_from =~ s/&nbsp;/ /g;
  $text_from =~ s/\x0a/_/g; # for empty lines
  $text_from =~ s/\n/_/g; # for empty lines
  $text_from =~ s/_{3,}/\n/g;
  $text_from =~ s/_/\n/g;
#  $text_from =~ s/[|_]//g; # use this line for extraneous chars
  $text_from =~ s/ {3,}/ /g;
  $text_from =~ s/\.{3,}/\./g;
}
#}

$text_from =~ s/(?<!\&#\w{4})[;]/ _$1_ /g; #Preserves punctuation for semicolon except for unicode decimal &#....;
$text_from =~ s/([.,?!])/ _$1_ /g; #Preserves punctuation


### Converts from native script to Romanized
if ($input_type eq "3") { # Win 1256
    $text_from =~ tr/\xc7\xc8\x81\xca\xcb\xcc\x8d\xcd\xce\xcf\xd0\xd1\xd2\x8e\xd3\xd4\xd5\xd6\xd8\xd9\xda\xdb\xdd\xde\xdf\x90\xe1\xe3\xe4\xe6\xe5\xed\xf3\xf5\xf6\xc2\xff\xc9\x98\xc1\xc0\xc6\xc4\xf0\xf8\xa1\xba\xbf\xab\xbb\x9d\xec/AbptVjcHxdLrzJsCSDTZEGfqkglmnuhiaoe\x5d\x7cPkMXIUN~,;?{}\-i/; }

if ($input_type eq "2") { # ISIRI 3342
    $text_from =~ tr/\xc1\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xfe\xf0\xf2\xf1\xc0\xc1\xfc\xda\xe1\xc2\xfb\xfa\xf3\xf6\xac\xbb\xbf\xa5\xe7\xe6\xa1/AbptVjcHxdLrzJsCSDTZEGfqKglmnuhyaoe\x5d\x7cPkiMIUN~,;?%{}\-/; }

if ($input_type eq "1") { # UTF-8
#    my $text_from_temp = `echo -e \"$text_from\" | ./utf8_2_roman.pl `;
    my $text_from_temp = `echo -e "$text_from" | ./utf8_2_roman.pl `;
    $text_from = $text_from_temp;

#    $text_from =~ tr/ابپتثجچحخدذرزژسشصضطظعغفقكگلمنوهيَُِآ☿ةکیءۀئؤًّ،؛؟٪/AbptVjcHxdLrzJsCSDTZEGfqkglmnuhiaoe\x5d\x7cPkiMXIUN~,;?%/; 
#    $text_from =~ tr/\u{0627}\u{0628}\u{067e}\u{062a}\u{062b}\u{062c}\u{0686}\u{062d}\u{062e}\u{062f}\u{0630}\u{0631}\u{0632}\u{0698}\u{0633}\u{0634}\u{0635}\u{0636}\u{0637}\u{0638}\u{0639}\u{063a}\u{0641}\u{0642}\u{0643}\u{06af}\u{0644}\u{0645}\u{0646}\u{0648}\u{0647}\u{064a}\u{064e}\u{064f}\u{0650}\u{0622}\u{263f}\u{0629}\u{06a9}\u{06cc}\u{0621}\u{06c0}\u{0626}\u{0624}\u{064b}\u{0651}\u{060c}\u{061b}\u{061f}\u{066a}\u{200c}/AbptVjcHxdLrzJsCSDTZEGfqkglmnuhiaoe\x5d\x7cPkiMXIUN~,;?%-/; 
#    $text_from =~ tr/}\u{d8a7}\u{d8a8}\u{d9be}\u{d8aa}\u{d8ab}\u{d8ac}\u{da86}\u{d8ad}\u{d8ae}\u{d8af}\u{d8b0}\u{d8b1}\u{d8b2}\u{da98}\u{d8b3}\u{d8b4}\u{d8b5}\u{d8b6}\u{d8b7}\u{d8b8}\u{d8b9}\u{d8ba}\u{d981}\u{d982}\u{d983}\u{daaf}\u{d984}\u{d985}\u{d986}\u{d988}\u{d987}\u{d98a}\u{d98e}\u{d98f}\u{d990}\u{d8a2}\u{263f}\u{d8a9}\u{daa9}\u{db8c}\u{d8a1}\u{db80}\u{d8a6}\u{d8a4}\u{d98b}\u{d991}\u{d88c}\u{d89b}\u{d89f}\u{d9aa\x{e2808c}/AbptVjcHxdLrzJsCSDTZEGfqkglmnuhiaoe\x5d\x7cPkiMXIUN~,;?%-/; 
}

if ($input_type eq "0") { # Unicode Decimal
%unicode2roman = ( 
		 '&#1575;' => 'A', '&#9791;' => '|', "&#1576;" => 'b', '&#1577;' => 'P', '&#1662;' => 'p', '&#1578;' => 't', '&#1579;' => 'V', '&#1580;' => 'j', '&#1670;' => 'c', '&#1581;' => 'H', '&#1582;' => 'x', '&#1583;' => 'd', '&#1584;' => 'L', '&#1585;' => 'r', '&#1586;' => 'z', '&#1688;' => 'J', '&#1587;' => 's', '&#1588;' => 'C', '&#1589;' => 'S', '&#1590;' => 'D', '&#1591;' => 'T', '&#1592;' => 'Z', '&#1593;' => 'E', '&#1594;' => 'G', '&#1601;' => 'f', '&#1602;' => 'q', '&#1603;' => 'k', '&#1705;' => 'k', '&#1711;' => 'g', '&#1604;' => 'l', '&#1605;' => 'm', '&#1606;' => 'n', '&#1608;' => 'u', '&#1607;' => 'h', '&#1610;' => 'i', '&#1740;' => 'i', '&#1609;' => 'A', '&#1614;' => 'a', '&#1615;' => 'o', '&#1616;' => 'e', '&#1617;' => '~', '&#1570;' => ']', '&#1569;' => 'M', '&#1611;' => 'N', '&#1571;' => '|', '&#1572;' => 'U', '&#1573;' => '|', '&#1574;' => 'I', '&#1728;' => 'X', '&#1642;' => '%', '&#1548;' => ',', '&#1563;' => ';', '&#1567;' => '?', '&#8204;' => "-", ' ' => ' ', '.' => '.', ':' => ':', );
  @charx = split(/(?=\&\#)|(?=\s)|(?=\n)/, $text_from);
  $text_from = "";
  foreach $charx (@charx)
   {
     $text_from_new = $unicode2roman{$charx}; 
     $text_from = $text_from . $text_from_new;
   }
}


my $word2 = $text_from;
chomp $word2;
my $word =  `echo  \"$word2  \" | ~/public_html/stemmer.pl -u 2>/dev/null `;


##### End #####
    $word =~ s/_([.,;?!])_/$1/g;

### Converts it back from Romanized to native script 
    if ($input_type eq "3") { # Win 1256
      $word =~ s/i /\xec /g; # converts word final yeh's to alef maksura
      $word =~ s/i-/\xec-/g; # converts word final yeh's to alef maksura
      $word =~ s/i$/\xec/g; # converts word final yeh's to alef maksura
      $word =~ tr/AbptVjcHxdLrzJsCSDTZEGfqKglmnuhiaoe\x5d\x7cPkMXIUN~,;?{}\-/\xc7\xc8\x81\xca\xcb\xcc\x8d\xcd\xce\xcf\xd0\xd1\xd2\x8e\xd3\xd4\xd5\xd6\xd8\xd9\xda\xdb\xdd\xde\xdf\x90\xe1\xe3\xe4\xe6\xe5\xed\xf3\xf5\xf6\xc2\xff\xc9\x98\xc1\xc0\xc6\xc4\xf0\xf8\xa1\xba\xbf\xab\xbb\x9d/; 
    }

#    if ($input_type eq "2") {  # ISIRI 3342
#      $word =~ tr/AbptVjcHxdLrzJsCSDTZEGfqKglmnuhyaoe\x5d\x7cPkiMXIUN~,;?%{}\-/\xc1\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xfe\xf0\xf2\xf1\xc0\xc1\xfc\xda\xe1\xc2X\xfb\xfa\xf3\xf6\xac\xbb\xbf\xa5\xe7\xe6\xa1/; 
#    }

    if ($input_type eq "1") { # UTF-8
#    $word =~ tr/AbptVjcHxdLrzJsCSDTZEGfqkglmnuhyaoe\x5d\x7cPkiMXIUN~,;?%/ابپتثجچحخدذرزژسشصضطظعغفقكگلمنوهيَُِآ☿ةکیءۀئؤًّ،؛؟٪/; 
    }

    print '<div style="font-family: Monospace">';
    print "\n<pre>\n";
    print "$word\n";
    $word =~ s/\n/<br\/>\n/g;
    $word =~ s/_\+0/ /g;
    $word =~ s/_\+/ /g;
    $word =~ s/-\+_/ /g;
    $word =~ s/\+_/ /g;

#    print "$word ";
    print "$word<br/><br/>\n";
#    system "echo -e \"$word\n\" | ~/public_html/lgparser ~/public_html/persianlg/data/4.0.dict ";
    my $lgout =  `echo -e \"!width=$width\n$word\n\" | ~/public_html/lgparser ~/public_html/persianlg/data/4.0.dict 2>/dev/null `;
    $lgout =~ s/.*Opening.*\n//g;
    $lgout =~ s/.*RETURN.*\n//g;
    $lgout =~ s/\+\+\+\+Time//g;
    $lgout =~ s/linkparser\>//g;
    $lgout =~ s/.*width set to.*\n//g;
    print $lgout;
    print "\n</pre>\n";
    print '</div>';

# }# ends foreach (@word)

print( "\n<br\/>\n<\/td><\/tr><\/table>\n");
print '<br/><a href="../persianlg_linkdefs.html">Definition of Links</a><br/>';
print '<br/><a href="../persianlg.html">Return to Main Page</a><br/>';
print( "\n<\/body>\n<\/html>\n");

