from os import listdir
import re
def _nettoyer_dump(txt):
    """Return *txt* cleaned up for concatenation.

    Steps, in order: strip leading whitespace from every line (which also
    collapses blank lines), blank out lines that start with one of the
    filtered prefixes, strip again to remove the lines just blanked, and
    lowercase the result.
    """
    txt = re.sub(r"^\s*", "", txt, flags=re.MULTILINE)
    # NOTE(review): the bare ``o`` alternative matches ANY line starting with a
    # lowercase "o" (e.g. "only ..."), not just a standalone bullet marker --
    # presumably meant for list bullets; confirm before tightening the pattern.
    txt = re.sub(r"^(\*|\+|o|IFRAME|\(BUTTON\)|#|\[|_|\().*", "", txt, flags=re.MULTILINE)
    txt = re.sub(r"^\s*", "", txt, flags=re.MULTILINE)
    return txt.lower()


def _dump_vers_xml(nom_fichier, txt):
    """Build the tagged record written for one dump file.

    The header lines and the ``<fichier="...">`` opening tag are kept exactly
    as before. Each remaining line of text is wrapped in its own balanced
    ``<ligne>...</ligne>`` pair; the previous implementation always left one
    dangling ``<ligne>`` right before ``</fichier>``.
    The text itself is not XML-escaped.
    """
    lignes = [ligne for ligne in _nettoyer_dump(txt).split("\n") if ligne]
    corps = "".join(f"<ligne>{ligne}</ligne>\n" for ligne in lignes)
    return (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        f'<fichier="{nom_fichier}">\n'
        f"{corps}"
        "</fichier>\n"
    )


def concat_dump(
    dossier_dt="C:\\Users\\elisa\\Downloads\\Projet encadré\\PROJET-MOT-SUR-LE-WEB\\DUMP-TEXT\\",
    dossier_sortie="",
):
    """Concatenate the cleaned text dumps of each language folder into one file.

    For each of the sub-folders ``dump_en1``, ``dump_en2``, ``dump_fr`` and
    ``dump_ru`` of *dossier_dt*, every file is read (UTF-8, undecodable bytes
    ignored), cleaned, wrapped in ``<fichier>``/``<ligne>`` tags and appended
    to ``concat_dump_en1.xml``, ``concat_dump_en2.xml``,
    ``concat_dump_french.xml`` and ``concat_dump_ru.xml`` respectively.
    All four output files are always (re)created, even when the matching
    sub-folder does not exist. Other entries of *dossier_dt* are ignored.

    Args:
        dossier_dt: Directory containing the ``dump_*`` sub-folders.
        dossier_sortie: Directory receiving the output files; the default
            ``""`` writes them relative to the current working directory.

    Raises:
        OSError: If *dossier_dt* cannot be listed or a file cannot be
            opened. The input directory is listed before any output file is
            opened, so a missing input directory no longer truncates
            existing outputs.
    """
    import os  # function-scope import: the file header only provides ``listdir``

    sorties = {
        "dump_en1": "concat_dump_en1.xml",
        "dump_en2": "concat_dump_en2.xml",
        "dump_fr": "concat_dump_french.xml",
        "dump_ru": "concat_dump_ru.xml",
    }
    presents = set(listdir(dossier_dt))
    for nom_dossier, nom_sortie in sorties.items():
        # ``with`` guarantees the output file is flushed and closed, including
        # when reading one of the inputs raises.
        with open(os.path.join(dossier_sortie, nom_sortie), "w", encoding="utf-8") as sortie:
            if nom_dossier not in presents:
                continue
            print("ok")
            chemin_dossier = os.path.join(dossier_dt, nom_dossier)
            # listdir() order is arbitrary; sorting makes the output reproducible.
            for fichier in sorted(listdir(chemin_dossier)):
                print(fichier)
                chemin = os.path.join(chemin_dossier, fichier)
                with open(chemin, encoding="UTF-8", errors="ignore") as entree:
                    brut = entree.read()
                sortie.write(_dump_vers_xml(fichier, brut))
def concat_contxt(
    dossier_contxt="C:\\Users\\elisa\\Downloads\\Projet encadré\\PROJET-MOT-SUR-LE-WEB\\CONTEXTES\\",
    dossier_sortie="",
):
    """Concatenate the ``.txt`` context files of each language folder.

    For each of the sub-folders ``contexte_en1``, ``contexte_en2``,
    ``contexte_fr`` and ``contexte_ru`` of *dossier_contxt*, the content of
    every file whose name ends with ``.txt`` is read (UTF-8, undecodable
    bytes ignored) and appended unchanged to ``concat_contxt_en1.txt``,
    ``concat_contxt_en2.txt``, ``concat_contxt_french.txt`` and
    ``concat_contxt_ru.txt`` respectively. All four output files are always
    (re)created, even when the matching sub-folder does not exist. Other
    entries of *dossier_contxt* are ignored.

    Args:
        dossier_contxt: Directory containing the ``contexte_*`` sub-folders.
        dossier_sortie: Directory receiving the output files; the default
            ``""`` writes them relative to the current working directory.

    Raises:
        OSError: If *dossier_contxt* cannot be listed or a file cannot be
            opened. The input directory is listed before any output file is
            opened, so a missing input directory does not truncate existing
            outputs.
    """
    import os  # function-scope import: the file header only provides ``listdir``

    sorties = {
        "contexte_en1": "concat_contxt_en1.txt",
        "contexte_en2": "concat_contxt_en2.txt",
        "contexte_fr": "concat_contxt_french.txt",
        "contexte_ru": "concat_contxt_ru.txt",
    }
    presents = set(listdir(dossier_contxt))
    for nom_dossier, nom_sortie in sorties.items():
        # ``with`` guarantees the output file is flushed and closed, including
        # when reading one of the inputs raises.
        with open(os.path.join(dossier_sortie, nom_sortie), "w", encoding="UTF-8") as sortie:
            if nom_dossier not in presents:
                continue
            chemin_dossier = os.path.join(dossier_contxt, nom_dossier)
            # listdir() order is arbitrary; sorting makes the output reproducible.
            for fichier in sorted(listdir(chemin_dossier)):
                if not fichier.endswith(".txt"):
                    continue
                chemin = os.path.join(chemin_dossier, fichier)
                with open(chemin, encoding="UTF-8", errors="ignore") as entree:
                    sortie.write(entree.read())
# Script entry: run both concatenation steps in order, using their default
# (no-argument) behavior. A failure in the first step stops the script before
# the second one runs.
for _etape in (concat_dump, concat_contxt):
    _etape()