making it better:
- make more independent subprograms (html_table is a bit messy, but I'm a bit too lazy to do it now)
- reading the keywords from a file or from the user's standard input (60)
- maybe using references in Perl would make the whole thing faster
- ...
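The keyword-file idea above could be sketched like this (a minimal sketch; the file name and the tab-separated "language<TAB>keyword" format are my assumptions, not something the script defines):

```perl
#!/usr/bin/perl
use strict;
use warnings;

# Hypothetical replacement for the hard-coded %motivy hash: read
# "language<TAB>keyword" pairs from a file such as ../URLS/keywords.txt.
sub read_keywords {
    my ($path) = @_;
    my %motivy;
    open(my $fh, "<:utf8", $path) or die "I cannot open $path: $!";
    while (my $line = <$fh>) {
        chomp $line;
        next if $line =~ /^\s*$/;                # skip blank lines
        my ($lang, $motif) = split /\t/, $line, 2;
        $motivy{$lang} = $motif;
    }
    close $fh;
    return %motivy;
}
```

html_table could then look the keyword up with `$motivy{$lang}` exactly as it does now.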
problems and solutions:
- (60) a keyword in Burmese has to be in this format:
"\x{1005}\x{102E}\x{1038}\x{1015}\x{103C}\x{102C}\x{1038}\x{1031}\x{101B}\x{1038}"
and not as "\xe1\x80\x85\xe1\x80\xae\xe1\x80\xb8\xe1\x80\x95\xe1\x80\xbc\xe1\x80" (which was used in the bash version). I'm not quite sure why, but probably because I'm reading the files as utf8. Anyway, like this it works perfectly.
- I had a typical problem with encoding, but I think it's resolved more than well by using two Perl modules (Encode::Detect and Encode::Guess) and, at the end, converting the html entities into utf-8 (using HTML::Entities)
- if a link is not downloaded, it is skipped and the program passes on to the next one
- there are some cases where an occurrence wasn't found in the dumped file; this is probably caused by the bad structure of the webpage (a link inside a link, or too-dynamic content, I don't know); usually only the title of the page gets downloaded
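A small sketch of why the two notations behave differently: `\x{1005}` is one character (the codepoint U+1005), while `\xe1\x80\x85` is three separate bytes that happen to be its UTF-8 encoding, so a byte-written keyword never matches a string read through the `:utf8` layer (which is made of characters, not bytes):

```perl
#!/usr/bin/perl
use strict;
use warnings;
use Encode qw(encode);

my $char_form = "\x{1005}";        # one character: U+1005
my $byte_form = "\xe1\x80\x85";    # three bytes: the UTF-8 encoding of U+1005

print length($char_form), "\n";    # 1 (character)
print length($byte_form), "\n";    # 3 (bytes)
# encoding the character form yields exactly the byte form:
print encode("UTF-8", $char_form) eq $byte_form ? "same" : "different", "\n";
```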
Code source:
- #!/usr/bin/perl
- #use warnings;
- use LWP::Simple;
- use Encode;
- require Encode::Detect;
- use Encode::Guess;
- use Data::Dumper;
- use Time::Stopwatch;
- use HTML::TreeBuilder;
- use HTML::Entities;
- mkdir("../CONTEXTS");
- mkdir("../DUMP_TEXT");
- mkdir("../IMAGES");
- mkdir("../PAGES_ASPIREES");
- mkdir("../TABLEAUX");
- opendir(DIR, "../URLS") or die "I cannot open dir: $!";
- open(ERROR,">:utf8","error_reading_url.txt") or die "I'm not able to open a file.";
- print "\noperation started:\n";
- while (my $file = readdir(DIR)) { #open the directory with the url files
-     next unless ($file =~ m/\.txt$/); #take each file.txt
-     print "\n-> processing urls (".$file.")";
-     tie my $timer, 'Time::Stopwatch';
-     html_table($file); #processing the html file
-     print "\n   done in $timer s";
- }
-
- close DIR;
- close ERROR;
- print "\n-> dumping results together";
- dump_globaux();
- dump_globaux_context();
- tabs_globaux();
- print "\n\nsuccessfully done\n";
- print "(list of wrong urls here: error_reading_url.txt)\n";
- ##################################################
- #                  SUBPROGRAMS                   #
- ##################################################
- sub html_table {
- # create an html table with urls and downloaded links
- # ARGUMENTS: file
- my $file=$_[0];
- my $lang=substr($file,0,-4);
- my $n=1;
- my $nf=0;
- my $occurences;
-
- my %motivy = ("allemand" => "inve[rs]t", "anglais" => "inve[rs]t", "espagnol" => "inve[rs]t", "francais" => "invest", "myanmar" => "\x{1005}\x{102E}\x{1038}\x{1015}\x{103C}\x{102C}\x{1038}\x{1031}\x{101B}\x{1038}", "norvegien" => "invest", "tcheque" => "invest");
-
- my $motif = $motivy{$lang}; #searched word
-
- open(INPUT,"<:utf8","../URLS/".$file) or die "I'm not able to open a file."; #open the file with urls
- open(EXP,">:utf8","../TABLEAUX/".$lang.".html") or die "I'm not able to open a file."; #create an html file named after the language
- mkdir("../PAGES_ASPIREES/".$lang); #create a directory for the downloaded pages
- mkdir("../DUMP_TEXT/".$lang); #create a directory for the dumped pages
-
- print EXP "<html>\n\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\">\n<title>$lang</title>\n</head>\n\n<body bgcolor=\"#003\">\n\n<table align=\"center\" border=\"1\">\n<tr align=\"center\" bgcolor=\"#D2E6FF\"><td width=\"20px\"><b>n°</b></td><td><b>URL</b></td><td width=\"90px\"><b>PAGES<br>ASPIREES</b></td><td width=\"100px\"><b>DUMP initial</b></td><td width=\"100px\"><b>DUMP utf-8</b></td><td width=\"100px\"><b>CONTEXTE utf-8</b><br>motif: <blue>$motif</blue></td><td width=\"100px\"><b>CONTEXTE HTML</b><br>motif: <blue>$motif</blue></td><td width=\"60px\"><b>OCCS</b></td></tr>\n";
- while(<INPUT>) { #reading each url and processing it
-     chomp($_);
-     my $url=$_;
-     my $tmp=substr($_,0, 30);
-
-     my $charset = download_html($_,$lang,$n); #download and charset
-     my $occs = context_txt_html($n,$lang,$motif);
-     $occurences = $occurences + $occs;
-
-     if ($charset ne "not downloaded") {
-     #if the page is not downloaded no row is created in the table, and the file is overwritten by the next successfully downloaded page
-
-     print EXP "<tr align=\"center\" bgcolor=\"#D2E6FF\"><td>$n</td><td><a href=\"$url\">$tmp</a></td><td><a href=\"../PAGES_ASPIREES/$lang/$n.html\">$n.html</a></td><td>";
-     if ($charset ne "UTF-8" and $charset ne "utf8") { #"and", not "or": with "or" the condition is always true
-         print EXP "<a href=\"../DUMP_TEXT/$lang/$n.txt\">$n.txt</a><br><small>($charset)</small>";
-     }
-
-     print EXP "</td><td><a href=\"../DUMP_TEXT/$lang/$n-utf8.txt\">$n-utf8.txt</a></td>";
-
-     if ($occs != 0) {
-         print EXP "<td><a href=\"../CONTEXTS/$lang/$n-utf8.txt\">$n-utf8.txt</a></td><td><a href=\"../CONTEXTS/$lang/$n-utf8.html\">$n-utf8.html</a></td><td>$occs</td></tr>\n";
-         $nf++;
-     }
-     else {
-         print EXP "<td bgcolor=\"#B0C4DE\">not found</td><td bgcolor=\"#B0C4DE\">not found</td><td bgcolor=\"#B0C4DE\">$occs</td></tr>\n";
-     }
-
-     $n++;
-
-     #lighter #D2E6FF
-     #darker #B0C4DE
-     }
- }
-
- print EXP "<tr align=\"center\" bgcolor=\"#B0C4DE\"><td></td><td></td><td></td><td></td><td></td><td bgcolor=\"#D2E6FF\"><a href=\"../FICHIERGLOBAUX/DUMP-GLOBAUX_$lang.txt\">Fichier DUMP global</a><br><small>$nf fichier(s)</small></td><td bgcolor=\"#D2E6FF\"><a href=\"../FICHIERGLOBAUX/CONTEXTES-GLOBAUX_$lang.txt\">Fichier CONTEXTES global</a><br><small>$nf fichier(s)</small></td><td bgcolor=\"#D2E6FF\"><b>$occurences</b></td></tr>\n";
- print EXP "</table>\n\n</body></html>";
-
- close INPUT;
- close EXP;
- }
- sub download_html {
- # download the html file from the url and return its encoding
- # ARGUMENTS: url, language, number
- # RETURNS: encoding
- my $content = get($_[0]) or print ERROR $_[2]."\t".$_[1]."\t".$_[0]."\n"; #download the content of the url
- my $enc;
-
- if ($content) { #saving the html file and getting the encoding
-     open(S,">:utf8","../PAGES_ASPIREES/".$_[1]."/".$_[2].".html") or die "I'm not able to open a file.";
-     print S $content; #saved to file
-     close S;
-
-     $enc = Encode::Detect::Detector::detect($content); #getting the encoding
-
-     if (!defined $enc or $enc eq "") { #guessing the encoding
-         my $x = Dumper(guess_encoding($content)->name);
-         $x =~ /\'(.*)\'/;
-         $enc = $1;
-     }
-
-     txt_save($content,$_[1],$_[2],$enc); #dumping the file (utf8 and, if necessary, the original)
- }
- else {
-     $enc = "not downloaded";
- }
-
- return $enc; #returns the encoding
- }
- sub txt_save {
- # saves a dumped text
- # ARGUMENTS: html-content, language, number, charset
- open(TXTU,">:utf8","../DUMP_TEXT/".$_[1]."/".$_[2]."-utf8.txt") or die "I'm not able to create dumped txt-utf8 file.";
-
- if ($_[3] ne "UTF-8" and $_[3] ne "utf8") { #non utf-8 => save a copy and convert
-     open(TXT,">:utf8","../DUMP_TEXT/".$_[1]."/".$_[2].".txt") or die "I'm not able to create dumped txt file.";
-     print TXT delete_html($_[0]);
-     close TXT;
-     $_[0] = decode($_[3], $_[0]);
- }
-
- print TXTU decode_entities(delete_html($_[0]));
-
- close TXTU;
- }
- sub delete_html {
- # delete every html tag
- # ARGUMENTS: html-content
- # RETURNS: content-without-html
- my $tree = HTML::TreeBuilder->new;
- $tree->parse($_[0]);
- my $non_html = $tree->as_text();
- $tree->delete; #free the parse tree
- return $non_html;
- }
- sub context_txt_html {
- # finds a KWIC and prints it into TXT and HTML
- # ARGUMENTS: number, language, motif
- # RETURNS: number-of-occurrences
- my $n=$_[0];
- my $lang=$_[1];
- my $motif=$_[2];
- my $context=200;
- my $x = 0;
-
- mkdir("../CONTEXTS/".$lang); #create a directory for the contexts
-
- open(UTF_TXT,"<:utf8","../DUMP_TEXT/".$lang."/".$n."-utf8.txt"); #or print "\n\tI'm not able to read the dumped file $n-utf8.txt, but I keep working, you don't need to worry."; #if there is a problem, delete that last construct
- open(CONT_TXT,">:utf8","../CONTEXTS/".$lang."/".$n."-utf8.txt") or die "I'm not able to create context $n-utf8.txt.";
- open(CONT_HTML,">:utf8","../CONTEXTS/".$lang."/".$n."-utf8.html") or die "I'm not able to create context $n-utf8.html.";
-
- print CONT_HTML "<html>\n<head>\n<title>minigrep : $motif ($lang)</title>\n";
- print CONT_HTML "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf8\" />\n</head>\n";
- print CONT_HTML "<body bgcolor=\"white\">\n";
- print CONT_HTML "<p align=\"justify\"><b>Input file :</b> $n-utf8.txt<br>\n";
- print CONT_HTML "<b>KeyWord :</b> $motif<br>\n";
- print CONT_HTML "<b>Language :</b> $lang<br>\n";
- print CONT_HTML "<b>Number of context-characters :</b> +- $context</p>\n<hr><hr>\n";
-
- print CONT_TXT "LEFT CONTEXT\tKEY-WORD\tRIGHT CONTEXT\n";
-
- while(<UTF_TXT>) {
-     chomp($_);
-
-     while ($_ =~ /($motif[^ ]*)/gi) { #searching for the keyword
-         $x++;
-         my $cl = $context;
-         my $cr = $context;
-
-         my $motif_ok = $1; #the exact form that was matched
-         my $left = $`; #only $context characters before and after the keyword get printed
-         my $right = $';
-
-         ### left context
-         if (length($left)>$cl) {
-             until (substr($left,-$cl-1,1) eq " ") { #cut at the first whole word
-                 $cl=$cl-1;
-             }
-             $left = substr($left,-$cl,$cl); #print the last $cl characters, beginning with a whole word
-         }
-
-         ### right context
-         if (length($right)>$cr) {
-             until (substr($right,$cr+1,1) eq " ") {
-                 $cr=$cr-1;
-             }
-             $right = substr($right,0,$cr);
-         }
-
-         ### printing
-         print CONT_TXT $left."\t".$motif_ok."\t".$right."\n";
-         print CONT_HTML "<p align=\"justify\">$left<font style=\"background-color: #FC3;\"><b>$motif_ok</b></font>$right</p>\n<hr>\n";
-     }
- }
-
- print CONT_HTML "</body></html>\n";
-
- close CONT_TXT;
- close CONT_HTML;
- close UTF_TXT;
-
- if ($x == 0){
-     unlink("../CONTEXTS/".$lang."/".$n."-utf8.txt");
-     unlink("../CONTEXTS/".$lang."/".$n."-utf8.html");
- }
- return $x;
- }
- sub dump_globaux {
- # makes a global dump file (from DUMP_TEXT) for every language (directory), containing every dumped file; saves it into the created directory FICHIERGLOBAUX
- mkdir("../FICHIERGLOBAUX/");
- opendir(DIR, "../DUMP_TEXT") or die "I cannot open dir: $!";
-
- while (my $dir = readdir(DIR)) { #walk the language directories
-     next if ($dir =~ m/\./); #skip "." and ".."
-
-     opendir(DIRE, "../DUMP_TEXT/".$dir) or die "I cannot open dir: $!";
-     open(DUMP,">:utf8","../FICHIERGLOBAUX/DUMP-GLOBAUX_".$dir.".txt") or die "I'm not able to create dump_globaux for $dir.";
-
-     while (my $file = readdir(DIRE)) { #take every dumped txt file
-         next unless ($file =~ m/\.txt$/);
-
-         open(INPUT,"<:utf8","../DUMP_TEXT/".$dir."/".$file) or die "I'm not able to open a file $file in directory $dir.";
-         while(<INPUT>) {
-             print DUMP $_;
-         }
-         close INPUT;
-     }
-     closedir DIRE;
-     close DUMP;
- }
-
- closedir DIR;
- }
- sub dump_globaux_context {
- # makes a global dump file (from CONTEXTS) for every language (directory), containing every context file (without its header); saves it into FICHIERGLOBAUX
- opendir(DIR, "../CONTEXTS") or die "I cannot open dir: $!";
-
- while (my $dir = readdir(DIR)) { #walk the language directories
-     next if ($dir =~ m/\./); #skip "." and ".."
-
-     opendir(DIRE, "../CONTEXTS/".$dir) or die "I cannot open dir: $!";
-     open(DUMP,">:utf8","../FICHIERGLOBAUX/CONTEXTES-GLOBAUX_".$dir.".txt") or die "I'm not able to create dump_globaux for $dir.";
-
-     while (my $file = readdir(DIRE)) { #take every context txt file
-         next unless ($file =~ m/\.txt$/);
-
-         open(INPUT,"<:utf8","../CONTEXTS/".$dir."/".$file) or die "I'm not able to open a file $file in directory $dir.";
-         while(<INPUT>) {
-             next if ($_ =~ m/RIGHT CONTEXT$/); #skip the header line
-             print DUMP $_;
-         }
-         close INPUT;
-     }
-
-     closedir DIRE;
-     close DUMP;
- }
- closedir DIR;
- }
- sub tabs_globaux {
- # dumps the tableaux files into one (taking every line which begins with "<tr"), giving each table a header
- opendir(DIR, "../TABLEAUX") or die "I cannot open dir: $!";
- open(EXP,">:utf8","../TABLEAUX/tabs_globaux.html") or die "I'm not able to create tabs_globaux.html.";
- my $n=0;
-
- print EXP "<html>\n\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\">\n<title>Tableaux_globaux</title>\n</head>\n\n<body bgcolor=\"#4B0082\">\n\n";
-
- while (my $file = readdir(DIR)) { #take every language table
-     next unless ($file =~ m/\.html$/ and $file ne "tabs_globaux.html");
-     my $fic=substr($file,0,-5);
-     $n++;
-
-     print EXP "<table align=\"center\" border=\"1\">\n";
-     print EXP "<tr><td colspan=\"8\" align=\"center\" bgcolor=\"black\"><font color=\"white\"><b>Tableau n° $n</b><br><small>fichier : $fic.txt</small></font></td></tr>";
-
-     open(INPUT,"<:utf8","../TABLEAUX/".$file) or die "I'm not able to open a file $file.";
-     while(<INPUT>) {
-         next unless ($_ =~ m/<tr/);
-         print EXP $_;
-     }
-     close INPUT;
-     print EXP "</table>\n\n<br><hr><br>\n\n";
- }
-
- print EXP "</body></html>";
-
- closedir DIR;
- close EXP;
- }
Comments:
4-11 calling the used modules
13-17 making the directories the script uses; it expects to be placed in the directory "PROGRAMME"
19 opening the directory containing the urls in text files
20 creating a file for saving the errors that occur while reading the url files
24-31 reading every txt file (containing urls) in the dir URLS
27 start counting the time
28 runs the subprogram html_table (49-114), passing a variable containing the name of the file
29 stopping the timer and printing it
38-40 calling the subprograms that make the global files (see below)
---------------------------------------
SUBPROGRAM HTML_TABLE
creates an html file with links and runs the subprograms which fetch and process the webpages
ARGUMENT: file
54 extract the name of the language from the name of the file
59 hash containing the keywords which will finally be searched in the text
61 the keyword for the current language, saved into a variable
63 opening a file containing urls
64 making the html page corresponding to the language
65-66 creating the directories, named by language, for saving the files
68 printing the first part of the html into the html file
70 reading the text file containing urls
77 calling download_html (args: url, language, number), which saves the page and returns its encoding
79 calling context_txt_html (args: number, language, keyword), which returns the number of occurrences in the document
80 counting the occurrences for the whole language
81-106 if the page is not downloaded this part is skipped: the number is not increased and we go on to the next file (not continuing with the table)
85 the next part of the row in the table
86-88 if the charset isn't utf-8, printing a link to the non-converted file
92-99 verifying the occurrences: if 0, printing "not found", else printing the links into the table
101 increasing the number, which means switching to the next line of the url file (next link)
109-110 printing the end of the table and of the html document
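One pitfall worth noting around the charset test (line 86): written with `or` (`$charset ne "UTF-8" or $charset ne "utf8"`) the condition is always true, since no string can equal both spellings at once; `and` is what's needed, as txt_save already does. A tiny sketch of the difference:

```perl
#!/usr/bin/perl
use strict;
use warnings;

my $charset = "utf8";
# wrong: true for every possible value of $charset
my $wrong = ($charset ne "UTF-8" or $charset ne "utf8");
# right: true only when $charset is neither spelling of utf-8
my $right = ($charset ne "UTF-8" and $charset ne "utf8");

print $wrong ? "wrong test fires\n" : "wrong test stays quiet\n";
print $right ? "right test fires\n" : "right test stays quiet\n";
```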
---------------------------------------
SUBPROGRAM DOWNLOAD_HTML
downloads the html file from the url and returns its encoding
ARGUMENTS: url, language, number
RETURNS: encoding
122 downloading the page's content (using the module LWP::Simple); if it can't be downloaded, an error is printed into the error file
125-142 if the web site is downloaded:
129 saving it into an html file
132 detecting the content encoding (module Encode::Detect)
134 if it isn't detected, we try to guess it (module Encode::Guess)
140 calling the subprogram which makes a txt file from the html
144-146 if not downloaded, the encoding is set to "not downloaded"
148 returns the encoding / "not downloaded"
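As an aside, the Dumper-and-regex trick on line 134 can be avoided: `guess_encoding` returns an Encode::Encoding object on success (whose `name` method can be called directly) and a plain error string on failure. A minimal sketch of that fallback (the helper name is mine):

```perl
#!/usr/bin/perl
use strict;
use warnings;
use Encode::Guess;

# Return the guessed charset name, or undef when guessing fails.
sub guess_charset {
    my ($content) = @_;
    my $guess = guess_encoding($content);
    return ref($guess) ? $guess->name : undef;   # an error string is not an object
}

# "caf\xc3\xa9" is invalid ascii but valid utf8, so the guess is unambiguous:
print guess_charset("caf\xc3\xa9"), "\n";
```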
---------------------------------------
SUBPROGRAM TXT_SAVE
saves a dumped text
ARGUMENTS: html-content, language, number, charset
158-163 if its encoding isn't utf-8, save a non-converted copy without html tags (using the delete_html subprogram)
165 saves the txt file in utf-8 without html tags (using the delete_html subprogram)
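The conversion order in txt_save matters: first decode the raw bytes from the detected charset into Perl characters, then resolve the html entities. A small sketch of that pipeline (the sample string is mine):

```perl
#!/usr/bin/perl
use strict;
use warnings;
use Encode qw(decode);
use HTML::Entities qw(decode_entities);

my $raw  = "caf\xe9 &amp; th\xe9";        # latin1 bytes containing an html entity
my $text = decode("iso-8859-1", $raw);    # bytes -> Perl characters
$text = decode_entities($text);           # "&amp;" -> "&"
# $text is now a character string, ready to be printed through a :utf8 handle
```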
---------------------------------------
SUBPROGRAM DELETE_HTML
deletes every html tag
ARGUMENTS: html-content
RETURNS: content-without-html
174-177 using the features of HTML::TreeBuilder
---------------------------------------
SUBPROGRAM CONTEXT_TXT_HTML
finds a KWIC and prints it into TXT and HTML
ARGUMENTS: number, language, motif
RETURNS: number-of-occurrences
187 setting the length of the context around the keyword
194-196 opening the source file and two export files (txt, html)
200-206 printing the html header and additional info
208 printing the header of the txt file
212-261 reading the source file, looking for the context, exporting it and counting the number of occurrences ($x)
232-237 and 239-244 looking for the first/last whole word within the context length
258-261 if the keyword wasn't found, deleting the files created in that case
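The whole-word trimming of lines 232-237 can be isolated into a small helper; this sketch (the function name is mine) also guards against an input with no space at all, where the original `until` loop could run past the start of the string:

```perl
#!/usr/bin/perl
use strict;
use warnings;

# Keep at most $max characters from the right end of $left, cutting on a
# space so the kept context starts with a whole word (same idea as the script).
sub trim_left_context {
    my ($left, $max) = @_;
    return $left if length($left) <= $max;
    my $cl = $max;
    $cl-- until $cl == 0 or substr($left, -$cl - 1, 1) eq " ";
    return substr($left, -$cl, $cl);   # empty when no space was found at all
}

print trim_left_context("aaa bbb ccc", 5), "\n";   # keeps the whole word "ccc"
```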
---------------------------------------
SUBPROGRAM DUMP_GLOBAUX
makes a global dump file (from DUMP_TEXT) for every language (directory), containing every dumped file; saves it into the created directory FICHIERGLOBAUX
270-290 browsing the directory and opening every txt file
281 printing the content of the txt files into one dumped file
---------------------------------------
SUBPROGRAM DUMP_GLOBAUX_CONTEXT
same as the subprogram above, with one difference: it skips the first line of each file.
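An alternative to matching the header text: Perl's `$.` variable holds the current line number of the most recently read filehandle, so the header can be skipped positionally instead of by pattern. A sketch (the helper name is mine):

```perl
#!/usr/bin/perl
use strict;
use warnings;

# Copy a context file to an already-open output handle, skipping line 1
# (the "LEFT CONTEXT\tKEY-WORD\tRIGHT CONTEXT" header).
sub copy_without_header {
    my ($in_path, $out_fh) = @_;
    open(my $in, "<:utf8", $in_path) or die "I cannot open $in_path: $!";
    while (my $line = <$in>) {
        next if $. == 1;    # $. restarts for each newly opened handle
        print {$out_fh} $line;
    }
    close $in;
}
```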
---------------------------------------
SUBPROGRAM TABS_GLOBAUX
dumps the tableaux files into one (taking every line which begins with "<tr"), giving each table a header
330 and 337-338 printing the beginning of the global html file
332-333 and 340 browsing the directory and opening an html file
341-344 reading an html file; every line which begins with "<tr" is printed into the global html file
347 closing the html table
351 closing the html file