La vie trilingue de l'identité nationale sur le web

Première phase du projet : la rédaction du script.

Noyau de notre projet, le script est essentiellement en Bash et nous a permis d'appliquer une suite de traitements sur nos URLs de départ.

Le Script

Écrit essentiellement en Bash, le script a été élaboré au fur et à mesure de l'avancement du projet, par ajouts et modifications successifs.

		 
		 #!/bin/bash
# NOTE: a shebang only takes effect when it is the very first bytes of the
# script file; when extracting this script, make sure the line above starts
# at column 0.
#
# Purpose
#   For every URL list found in a directory: download each URL, detect the
#   page encoding, dump the page as UTF-8 text with lynx, extract the lines
#   (contexts) matching the search pattern, and record one table row per URL.
#
# Input (stdin)
#   line 1: REP   - directory holding the URL list files (one URL per line)
#   line 2: tablo - path of the table file to (re)create
#
# Output (relative to the current directory)
#   PAGES-ASPIREES/<j>/<i>.html     downloaded page
#   DUMP-TXT/<j>/<i>.txt            lynx dump in the original encoding
#   DUMP-TXT/<j>/<i>-utf8.txt       lynx dump in UTF-8
#   CONTEXTES/<j>/<i>-utf8.txt      lines matching the pattern
#   CONTEXTES/<j>/<i>-utf8.html     output of minigrepmultilingue.pl
#   FICHIERGLOBAUX/*_<j>.txt        concatenation of all dumps / contexts
#   $tablo                          tab-separated table, one row per URL
#
# Globals shared with the helpers below: tablo, motif, nbOccur, nbdump.

read -r REP
read -r tablo

if [[ -z "$REP" || ! -d "$REP" ]]; then
  printf 'error: "%s" is not a directory of URL lists\n' "$REP" >&2
  exit 1
fi
if [[ -z "$tablo" ]]; then
  printf 'error: no output table file given\n' >&2
  exit 1
fi

# The alternatives are grouped and delimited by word boundaries only. The
# previous pattern required a literal space around each word, so forms such as
# "l'identité", or a word at the start/end of a line, could never match.
motif='\b(identité|идентичность|هوية)\b'

# Append one tab-separated row to the table file. Each argument is one cell.
write_row() {
  local IFS=$'\t'
  printf '%s\n' "$*" >> "$tablo"
}

# Succeed when the encoding name $1 is listed by `iconv -l` (case-insensitive,
# whole-name match). An empty name is never considered known.
iconv_knows() {
  [[ -n "$1" ]] && iconv -l | grep -F -i -w -q -- "$1"
}

# Print (lower-cased) the first charset=... value declared in the HTML file $1,
# or nothing when the page declares none.
extract_declared_charset() {
  grep -a -i -o -m 1 -E -- 'charset *= *["'\'']?[A-Za-z0-9_.:-]+' "$1" \
    | head -n 1 \
    | sed -E 's/.*= *["'\'']?//' \
    | tr '[:upper:]' '[:lower:]'
}

# Post-process the UTF-8 dump of URL number $2 of table $1: extract contexts,
# count occurrences (into the global nbOccur), build the HTML contexts with the
# perl helper, and append dump and contexts to the per-table global files.
# Increments the global nbdump.
process_utf8_dump() {
  local table=$1 url_no=$2
  local dump="./DUMP-TXT/$table/$url_no-utf8.txt"
  local contexte="./CONTEXTES/$table/$url_no-utf8.txt"

  grep -E -i -- "$motif" "$dump" > "$contexte"
  # Counted with -i as well, so the count agrees with the extracted contexts.
  nbOccur=$(grep -E -i -o -- "$motif" "$dump" | wc -l)
  nbOccur=${nbOccur//[[:space:]]/} # some wc implementations pad the number

  # The mv below expects the perl helper to leave resultat-extraction.html in
  # the current directory.
  perl5.16.3 ./PROGRAMMES/minigrepmultilingue.pl "UTF-8" "$dump" ./PROGRAMMES/motif-regexp.txt
  if [[ -f resultat-extraction.html ]]; then
    mv resultat-extraction.html "./CONTEXTES/$table/$url_no-utf8.html"
  else
    printf 'warning: no resultat-extraction.html produced for %s\n' "$dump" >&2
  fi

  cat "$contexte" >> "./FICHIERGLOBAUX/CONTEXTES-GLOBAUX_$table.txt"
  cat "$dump" >> "./FICHIERGLOBAUX/DUMP-GLOBAUX_$table.txt"
  nbdump=$((nbdump + 1))
}

# Dump page number $2 of table $1 with lynx in encoding $3, convert the dump
# to UTF-8 with iconv, then run the common post-processing.
convert_and_process() {
  local table=$1 url_no=$2 charset=$3
  local page="./PAGES-ASPIREES/$table/$url_no.html"
  local brut="./DUMP-TXT/$table/$url_no.txt"

  echo "VERIF : <$charset> ==> connu par inconv, c'est parti ==> lynx, iconv..."
  lynx -dump -nolist -display_charset="$charset" "$page" > "$brut"
  echo "ENCODAGE final : $charset (avant conversion vers utf-8)"
  if ! iconv -f "$charset" -t utf-8 "$brut" > "./DUMP-TXT/$table/$url_no-utf8.txt"; then
    printf 'warning: iconv failed converting %s from %s\n' "$brut" "$charset" >&2
  fi
  process_utf8_dump "$table" "$url_no"
}

mkdir -p ./FICHIERGLOBAUX

# Start the table file from scratch; title and heading are written together so
# that the heading no longer overwrites the title.
{
  echo "tableau de liens"
  echo " "
  echo ""
  echo ""
  echo ""
  echo "Tableau de données"
} > "$tablo"

# j numbers the tables (one per URL list file).
j=1
for chemin in "$REP"/*; do
  [[ -f "$chemin" ]] || continue
  fic=${chemin##*/}

  # i numbers the URLs inside the current list; nbdump counts UTF-8 dumps.
  i=1
  nbdump=0
  mkdir -p "CONTEXTES/$j" "DUMP-TXT/$j" "PAGES-ASPIREES/$j"

  {
    echo ""
    echo ""
    echo "Tableau n° $j fichier : $fic"
  } >> "$tablo"
  write_row "n°URL" "URL" "PAGES ASPIREES" "Ret. CURL" \
    "DUMP initial (non utf-8)" "DUMP utf-8" "CONTEXTES" "CONTEXTES HTML" "NB Occur"

  # URLs are read on fd 3 so that the tools run in the loop cannot consume
  # the list through stdin.
  while read -r ligne <&3 || [[ -n "$ligne" ]]; do
    ligne=${ligne%$'\r'}
    [[ -n "$ligne" ]] || continue

    echo "================================================================"
    echo "==========TRAITEMENT : $ligne "

    #------- download the page and determine its encoding -------------------
    page="./PAGES-ASPIREES/$j/$i.html"
    curl -o "$page" "$ligne"
    retourcurl=$?

    # A transfer that "succeeded" but returned an error page is marked BAD,
    # which also makes the status differ from 0 below. Kept on one line so it
    # fits in a single table cell.
    if [[ -f "$page" ]] \
      && grep -a -E -i -q -- "(400 )?Bad request|Moved Permanently|s interdit|Not Acceptable" "$page"; then
      retourcurl="$retourcurl BAD"
    fi
    echo "RETOUR CURL : $retourcurl"

    if [[ "$retourcurl" == 0 ]]; then
      encodage=$(file -i "$page" | cut -d= -f2)
      echo "ENCODAGE initial : $encodage"

      if [[ "$encodage" == "utf-8" ]]; then
        # Already UTF-8: dump directly, no conversion needed.
        lynx -dump -nolist -display_charset="$encodage" "$page" > "./DUMP-TXT/$j/$i-utf8.txt"
        process_utf8_dump "$j" "$i"
        write_row "$i" "$ligne" "$i.html" "$retourcurl" "-" \
          "$i-utf8.txt" "$i-utf8.txt" "$i-utf8.html" "$nbOccur"

      elif iconv_knows "$encodage"; then
        # The encoding reported by file is known to iconv: dump and convert.
        convert_and_process "$j" "$i" "$encodage"
        write_row "$i" "$ligne" "$i.html" "$retourcurl" "$i.txt ($encodage)" \
          "$i-utf8.txt" "$i-utf8.txt" "$i-utf8.html" "$nbOccur"

      else
        # Not UTF-8 and the encoding reported by file is unknown to iconv:
        # look for a charset declaration inside the downloaded page.
        echo "on cherche un charset dans la page aspiree..."
        encodage=$(extract_declared_charset "$page")

        if [[ -z "$encodage" ]]; then
          echo "Pas de charset detecte : on ne fait rien pour le DUMP... "
          write_row "$i" "$ligne" "$i.html" "$retourcurl" \
            "Encodage non détecté" "Encodage non détecté" "-" "-" "-"
        elif ! iconv_knows "$encodage"; then
          echo "VERIF : <$encodage> ==> inconnu par inconv, on ne fait rien"
          write_row "$i" "$ligne" "$i.html" "$retourcurl" \
            "Encodage non détecté" "Encodage non détecté" "-" "-" "-"
        else
          # The declared charset is known to iconv: dump and convert.
          convert_and_process "$j" "$i" "$encodage"
          write_row "$i" "$ligne" "$i.html" "$retourcurl" "$i.txt ($encodage)" \
            "$i-utf8.txt" "$i-utf8.txt" "$i-utf8.html" "$nbOccur"
        fi
      fi
    else
      # Download failed (or returned an error page): nothing to dump.
      write_row "$i" "$ligne" "-" "$retourcurl" "-" "-" "-" "-" "-"
    fi

    i=$((i + 1))
  done 3< "$chemin"

  # Closing row of the table: how many dumps went into the global files.
  write_row "" "Fichier DUMP global $nbdump fichier(s)" \
    "Fichier CONTEXTES global $nbdump fichier(s)"
  echo "" >> "$tablo"
  j=$((j + 1))
done
echo "" >> "$tablo"

Tableau n° $j fichier : $fic
n°URL	URL	PAGES ASPIREES	Ret. CURL	DUMP initial (non utf-8)	DUMP utf-8	CONTEXTES	CONTEXTES HTML	NB Occur
$i	$ligne	$i.html	$retourcurl	-	$i-utf8.txt	$i-utf8.txt	$i-utf8.html	$nbOccur
$i	$ligne	$i.html	$retourcurl	$i.txt ($encodage)	$i-utf8.txt	$i-utf8.txt	$i-utf8.html	$nbOccur
$i	$ligne	$i.html	$retourcurl	Encodage non détecté	Encodage non détecté	Encodage non détecté	-	-	-
$i	$ligne	$i.html	$retourcurl	$i.txt ($encodage)	$i-utf8.txt	$i-utf8.txt	$i-utf8.html	$nbOccur
$i	$ligne	$i.html	$retourcurl	Encodage non détecté	Encodage non détecté	-	-	-
$i	$ligne	-	$retourcurl	-	-	-	-	-
	Fichier DUMP global $nbdump fichier(s)	Fichier CONTEXTES global $nbdump fichier(s)

La vie trilingue de l'identité nationale sur le web

Première phase du projet : la rédaction du script. Noyau de notre projet, le script est essentiellement en Bash et nous a permis d'appliquer une suite de traitements sur nos URLs de départ.

Le Script

Tableau de données

Tableau n° $j

Première phase du projet : la rédaction du script.

Noyau de notre projet, le script est essentiellement en Bash et nous a permis d'appliquer une suite de traitements sur nos URLs de départ.