# Exo3_xml.py

import re,subprocess
from datetime import datetime
from codeudpipe import * 
def myMain(fichier, modele):
    """Convert the XML file *fichier* into the tagged file ``resultat.xml``.

    Steps, in order:
      1. read *fichier* and let ``header`` build the new header and isolate
         the ``<text>`` element;
      2. strip every tag from that text and save it to ``relai.txt``;
      3. run ``codeudpipe.py`` on ``relai.txt`` as an external command, which
         is expected to produce ``Jules_Verne2.conllu``;
      4. let ``tagging`` append the tagged content to ``resultat.xml`` and
         close the root ``<TEI>`` element.

    Args:
        fichier: Path of the source XML file.
        modele: Passed through to ``header`` unchanged.

    Raises:
        subprocess.CalledProcessError: if the UDPipe command exits with a
            non-zero status (``check=True``).
        OSError: if one of the files cannot be opened.
    """
    with open(fichier, "r", encoding="utf8") as entree, \
            open("resultat.xml", "w", encoding="utf8") as sortie:
        contenu = entree.read()
        # Process the header; get back the new header and the remaining text.
        entete, texte, name, title = header(contenu, modele)
        # Drop every tag from the text so that only plain text is left.
        texte = re.sub("<[^>]+>", "", texte)
        # The relay file must be closed (hence flushed) before the external
        # command reads it. UTF-8 matches the other files handled here;
        # presumably codeudpipe.py reads its input as UTF-8 — TODO confirm.
        with open("relai.txt", "w", encoding="utf8") as relai:
            relai.write(texte)
        sortie.write(entete)  # Write the header first.
        # UDPipe is run on the plain text as an external command to avoid
        # module conflicts. An argument list needs no shell, so nothing is
        # subject to shell parsing.
        subprocess.run(
            [
                "python3", "codeudpipe.py",
                "--input", "relai.txt",
                "--output", "Jules_Verne2.conllu",
                "--language", "fr",
            ],
            check=True,
        )
        # Fetch UDPipe's result and wrap it in XML tags.
        with open("Jules_Verne2.conllu", encoding="utf8") as conllu:
            tagging(conllu.read(), sortie, name, title)
        sortie.write("\n</TEI>")
        

def header(contenuxml, modele):
    """Build the ELTeC-2 header for the output file and isolate the text.

    The prolog (XML declaration and ``xml-model`` processing instructions) is
    retargeted at the local schema ``monschemaTEI.rng``; the ``<TEI>`` opening
    tag and ``<teiHeader>`` are copied with a few edits: " 2" appended to
    titles ending with a space, a responsibility note added to the
    ``titleStmt``, the encoding level switched to ``eltec-2`` and a dated
    ``<change>`` entry appended to the ``revisionDesc``.

    Args:
        contenuxml: Full content of the source XML document.
        modele: Not used by this function; kept so existing calls still work.

    Returns:
        A tuple ``(entete, texte, auteur, titre)`` where ``entete`` is the
        rewritten prolog followed by the rewritten header, ``texte`` is the
        ``<text>...</text>`` element, and ``auteur`` / ``titre`` are the
        contents of the first ``<author>`` / ``<title>`` element of the
        rewritten header.

    Raises:
        IndexError: if the prolog, two ``schematypens`` declarations, the
            ``<TEI ...></teiHeader>`` span or the ``<text>`` element cannot
            be found.
        AttributeError: if no ``<author>`` or ``<title>`` element is found
            (each must open and close on a single line).
    """
    # Greedy match with DOTALL: runs from the first "<?xml" to the LAST "?>"
    # of the document, so it covers the XML declaration and the xml-model
    # processing instructions as long as no "?>" occurs further down.
    head = re.findall(r"<\?xml.*\?>", contenuxml, re.DOTALL)[0]
    # Replace only the quoted href value, so the neighbouring
    # pseudo-attributes and the whitespace between them are preserved.
    head = re.sub(r'href="[^"]*"', 'href="monschemaTEI.rng"', head)
    # The first declaration is set to RELAX NG and the second to Schematron.
    # They are replaced by position so identical declarations cannot be
    # confused. The newline after the second one separates the prolog from
    # the <TEI> tag appended below.
    declarations = list(re.finditer(r"schematypens=.*?\?>", head, re.DOTALL))
    premiere, seconde = declarations[0], declarations[1]
    head = (
        head[:premiere.start()]
        + 'schematypens="http://relaxng.org/ns/structure/1.0"?>'
        + head[premiere.end():seconde.start()]
        + 'schematypens="http://purl.oclc.org/dsdl/schematron"?>\n'
        + head[seconde.end():]
    )

    heady = re.findall(r"<TEI xmln.*</teiHeader>", contenuxml, re.DOTALL)[0]
    heady = heady.replace(" </title>", " 2</title>")
    heady = heady.replace(
        "</titleStmt>",
        "<resp>Tokenisation, lemmatisation and POS-tagging with UDPipe by</resp>"
        "<name>Valentin-Gabriel Soumah</name>\n</titleStmt>",
    )
    heady = heady.replace('encodingDesc n="eltec-1"', 'encodingDesc n="eltec-2"')
    aujourdhui = datetime.now().strftime("%Y-%m-%d")
    heady = heady.replace(
        '</revisionDesc>',
        '<change when="' + aujourdhui + '">upgrade to ELTeC-2</change>\n</revisionDesc>',
    )

    # "\b" keeps "<title" from matching "<titleStmt>", and "[^>]*" keeps the
    # opening tag from running past its own ">".
    auteur = re.search(r"<author\b[^>]*>(.*?)</author>", heady).group(1)
    titre = re.search(r"<title\b[^>]*>(.*?)</title>", heady).group(1)
    texte = re.findall(r"<text>.*</text>", contenuxml, re.DOTALL)[0]
    return head + heady, texte, auteur, titre

# Characters that must not appear raw in XML text or in a double-quoted
# attribute value, mapped to their entity references (single pass per token).
_XML_ESCAPES = str.maketrans(
    {"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;"}
)


def _xml_escape(valeur):
    """Return *valeur* with ``&``, ``<``, ``>`` and ``"`` escaped for XML."""
    return valeur.translate(_XML_ESCAPES)


def _token_element(ligne):
    """Return the ``<pc>`` or ``<w>`` element for one tab-separated token line.

    Columns used: 1 = form, 2 = lemma, 3 = part of speech, 5 = features
    ("_" or absent means none). Raises IndexError if the line has fewer than
    four columns.
    """
    colonnes = ligne.split("\t")
    forme = _xml_escape(colonnes[1])
    lemme = _xml_escape(colonnes[2])
    pos = colonnes[3]
    if pos == "PUNCT":
        # Punctuation carries no attributes; its content is the lemma column.
        return "<pc>" + lemme + "</pc>"

    morceaux = ["<w", 'lemma="' + lemme + '"', 'pos="' + pos + '"']
    traits = colonnes[5] if len(colonnes) > 5 else "_"
    if traits != "_":  # Optional attributes.
        for trait in traits.split("|"):
            # Split on the first "=" so the whole value is kept.
            nom, _, valeur = trait.partition("=")
            # NOTE(review): CoNLL-U feature names may contain brackets
            # (e.g. "Number[psor]"), which are not valid XML attribute
            # names — TODO confirm how these should be rendered.
            morceaux.append(nom + '="' + valeur + '"')
    morceaux.append(">" + forme)
    return "\t".join(morceaux) + "</w>"


def tagging(texte,sortie,auteur,titre):
    """Write the UDPipe output *texte* to *sortie* as tagged XML.

    A ``<front>`` element is written, containing a title page (author and
    title) and a ``<div type="liminal">``. Inside that div, every
    blank-line-separated block of *texte* becomes one ``<p>``; a
    ``# sent_id`` comment opens an ``<s xml:id="s...">`` element, other
    comment lines are ignored, and every token line becomes a ``<pc>``
    (punctuation) or ``<w>`` element on its own line.

    Args:
        texte: CoNLL-U text (tab-separated token lines, ``#`` comments,
            blocks separated by an empty line).
        sortie: Writable text stream that receives the XML.
        auteur: Author name for the title page; written as is, so it must
            already be valid XML content.
        titre: Title for the title page; written as is, like *auteur*.

    Raises:
        IndexError: if a token line has fewer than four columns.
    """
    sortie.write("<front>\n<div type=\"titlepage\"><p>"+auteur+"</p>\n<p>"+titre+"</p>\n</div>\n")
    sortie.write('<div type="liminal">')
    for bloc in texte.split("\n\n"):
        # Whitespace-only blocks (such as the one after the final blank
        # line) produce no markup, which keeps <p> and <s> balanced
        # whether or not the input ends with a blank line.
        if not bloc.strip():
            continue
        sortie.write("<p>\n")
        phrase_ouverte = False
        for ligne in bloc.split("\n"):
            if not ligne:
                continue
            if ligne.startswith("#"):
                if not ligne.startswith("# sent_id"):
                    continue
                if phrase_ouverte:
                    sortie.write("</s>\n")
                # Keep exactly what follows "=", whatever characters the
                # identifier is made of.
                identifiant = ligne.partition("=")[2].strip()
                sortie.write('<s xml:id="s' + _xml_escape(identifiant) + '">\n')
                phrase_ouverte = True
            else:
                # NOTE(review): multiword-token range lines (ids such as
                # "3-4") are rendered like ordinary tokens — TODO confirm
                # that this is wanted.
                sortie.write(_token_element(ligne) + "\n")
        if phrase_ouverte:
            sortie.write("</s>\n")
        sortie.write("</p>\n")
    sortie.write("</div>\n</front>")

# Run the conversion only when this file is executed as a script.
# The comparison needs "==": a single "=" here is a syntax error that
# prevents the whole module from loading.
if __name__ == "__main__":
    myMain("FRA04002_Verne.xml", "en attente")