import sys, os, re

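# Usage: python3 BAO1.py <rubrique>      e.g.  python3 BAO1.py 3232
# (the corpus directory is hard-coded to data/2021 in the main block)
#
# Section ids (rubriques):
# 3232 : idées (ideas / opinion)
# 3476 : cinéma (cinema)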

def nettoyage(texte):
    """Return *texte* without CDATA wrappers and ``&nbsp;`` entities.

    Every ``<![CDATA[`` opening marker is removed, a ``]]>`` closing marker
    is removed only when it sits at the end of the string, and all
    ``&nbsp;`` entities are then deleted (not replaced by a space).
    """
    sans_cdata = re.sub(r"(<!\[CDATA\[)|(\]\]>$)", "", texte)
    return sans_cdata.replace("&nbsp;", "")

# Titles already exported, mapped to their description; used by
# parcoursarbo() to skip items whose title was seen before.
dico_titre = dict()

def parcoursarbo(dossier):
    """Recursively walk *dossier* and export the RSS items of the current section.

    Every regular file whose path contains the module-level ``rubrique``
    pattern and ends with ``.xml`` is scanned for ``<item>`` elements. Each
    item whose cleaned title is not yet a key of ``dico_titre`` is recorded
    there and written, with its description, to the module-level
    ``sortie_txt`` and ``sortie_xml`` file objects.

    Args:
        dossier: Path of the directory to explore.

    Raises:
        OSError: If *dossier* cannot be listed or a matching file cannot
            be opened.

    NOTE(review): titles and descriptions are written to the XML output
    without escaping ``&`` or ``<``; confirm downstream consumers accept that.
    """
    item_re = re.compile(r"<item><title>(.+?)<\/title>.+?<description>(.+?)<\/description>")
    # ``rubrique`` is still interpreted as a regex fragment (as before), but
    # the extension is now a literal ".xml" anchored at the end of the path,
    # so names such as "3232.xml.bak" or "3232axml" are no longer picked up.
    fichier_re = re.compile(r"(?:" + rubrique + r").*\.xml$")

    # Sorted so that the output order (and which duplicate is kept) is
    # reproducible from one run to the next; os.listdir() order is arbitrary.
    for doc in sorted(os.listdir(dossier)):
        # skip hidden entries such as .DS_Store
        if doc.startswith("."):
            continue

        chemin = os.path.join(dossier, doc)

        if os.path.isdir(chemin):
            # recurse into sub-directories
            parcoursarbo(chemin)
            continue

        if not os.path.isfile(chemin) or not fichier_re.search(chemin):
            continue

        # Single handle, closed as soon as the content has been read;
        # undecodable bytes are dropped rather than aborting the run.
        with open(chemin, encoding="utf-8", errors="ignore") as fichier:
            texte = fichier.read()

        for match in item_re.finditer(texte):
            titre = nettoyage(match.group(1))
            description = nettoyage(match.group(2))

            # skip items whose title has already been exported
            if titre in dico_titre:
                continue
            dico_titre[titre] = description

            sortie_txt.write(titre + "\n")
            sortie_txt.write(description + "\n\n")

            # The XML output has always received a second cleaning pass;
            # it is kept so the generated file stays unchanged.
            clean_title = nettoyage(titre)
            clean_description = nettoyage(description)

            sortie_xml.write(f"<item><titre>{clean_title}</titre><description>{clean_description}</description></item>\n")

if __name__ == "__main__":
    # Fail with a readable message instead of an IndexError when the
    # section id is missing from the command line.
    if len(sys.argv) < 2:
        sys.exit(f"usage: python3 {sys.argv[0]} <rubrique>")

    # The corpus root is fixed; only the section id comes from the CLI.
    rep = "data/2021"
    rubrique = sys.argv[1]

    # create the output directory for the current section if it is missing
    os.makedirs(f"data/bao1/{rubrique}", exist_ok=True)
    header = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<corpus>\n"

    # ``sortie_xml`` and ``sortie_txt`` are module-level names read by
    # parcoursarbo(); the ``with`` block guarantees both files are flushed
    # and closed even if the traversal raises.
    with open(f"data/bao1/{rubrique}/BAO1_{rubrique}.xml", "w", encoding="utf-8") as sortie_xml, \
            open(f"data/bao1/{rubrique}/BAO1_{rubrique}.txt", "w", encoding="utf-8") as sortie_txt:
        sortie_xml.write(header)
        parcoursarbo(dossier=rep)
        sortie_xml.write("</corpus>\n")