Python script for the HTML table

#!/usr/bin/env python3

import re, os, nltk, fugashi, string
from nltk.tokenize import RegexpTokenizer
from bs4 import BeautifulSoup
import requests
from urllib.request import Request, urlopen

################ Declarations 
# Paths of the directories used by the script
context_dir = "../CONTEXTES/"
dump_dir = "../DUMP-TEXT/"
images_dir = "../IMAGES/"
pages_dir = "../PAGES-ASPIREES/"
tables_dir = "../TABLEAUX/"
urls_dir = "../URLS/"
source_dir ="../V-RESSOURCES/"

# Punctuation variables
punct = list(string.punctuation)
punct.append("—")
punct.append("...")
punct.append("–")
punct.append("…")

#Most of the Japanese punctuation
jp_punct = (
[chr(i) for i in range(0x3000,0x3041)] + 
[chr(i) for i in range(0x30fb,0x3100)] + 
[chr(i) for i in range(0xff1a,0xff21)] + 
[chr(i) for i in range(0xff3b,0xff41)] +
[chr(i) for i in range(0xff5b,0xff66)]
)
jp_punct.append('(')
jp_punct.append(')')
jp_punct.append('“')
jp_punct.append('”')
jp_punct.append('!')

punct = punct + jp_punct

#Japanese stop_words list
stop_words = []
with open(f'{source_dir}stopwords_jp.txt', 'r', encoding='utf-8') as f :
    reader = f.readlines()
    for word in reader :
        word = word.strip()
        stop_words.append(word)
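
# stopwords_jp.txt is expected to contain one stop word per line.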

################ HTTP requests
def scrape_header(url):
    """
    Sends my informations to the websites so they know or don't ban my IP from 
    scraping datas.
    Parameter :
        url (str) : need an url
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    request = Request(url, headers=headers)

    return request

def url_code_http (url) :
    """
    Gets the http code from a url request
    Parameter :
        url (str) : needs an url
    """
    url = url.replace("\n", "")
    info = requests.get(url)
    code_http= info.status_code
    
    return code_http

################# Scraping the websites
def open_url(url):
    """
    Opens the url and returns the method to process it.
    Parameter : 
        url (str) : need a url
    """
    link = scrape_header(url)
    html =urlopen(link) 

    return BeautifulSoup(html, 'lxml')

def get_html_page (url):
    """
    Absorbs the entire content of a website on a hardrive
    Parameter :
        url (bs4.BeautifulSoup) : Need the result of BeautifulSoup 
    """

    soup = open_url(url)
    text_html = soup.prettify()

    f=open(f"{pages_dir}/{count_table}-{count_files}.html", 'w', encoding='utf-8')
    f.write(text_html)
    f.close()

def get_dump_text (url):
    """
    Gets all the raw text from a html file into a text file. 
    Parameter : 
        url (bs4.BeautifulSoup) : Need the result of BeautifulSoup
    """

    soup = open_url(url)
    dump_text = soup.get_text()

    w=open(f"{dump_dir}{count_table}-{count_files}.txt",'w', encoding='utf-8')
    w.write(dump_text)
    w.close()

    return dump_text

def url_char_encoding (url):
    """
    Gets the original encoding from the website (written in the html)
    as BeautifulSoup converts automatically texts into utf-8.
    Parameter :
        url (bs4.BeautifulSoup) : Need the result of BeautifulSoup
    """

    soup= open_url(url)

    return soup.original_encoding

################# This part builds an HTML table
################# The function continue_table() gathers the results of the other functions into one table row
def start_html():
    file=open(f"{tables_dir}tableau-{count_table}.html", 'w', encoding='utf-8') 
    file.write( """<!DOCTYPE html>\n
                <html>\n
                <head>\n
                <title>Projet sur le Web : Alcool</title>\n
                <meta charset = 'utf-8'/>
                </head>\n
                <body>\n\t""")
    file.close()

def end_html():
    file=open(f"{tables_dir}tableau-{count_table}.html", 'a', encoding='utf-8') 
    file.write( """</body>\n
                </html>""")
    file.close()

def html_table():
    file=open(f"{tables_dir}tableau-{count_table}.html", 'a', encoding='utf-8') 
    file.write( """<table align=\"center\" border=\"1px\" bordercolor=blue>\n
                <tr><td>N°</td><td>http-code</td><td>Encodage</td><td>URL</td>\
                <td>Page aspirée</td><td>Filtrage html</td><td>Index</td><td>Bigramme</td><td>Contexte</td></tr>""")
    file.close()

def continue_table():
    file=open(f"{tables_dir}tableau-{count_table}.html", 'a', encoding='utf-8') 
    file.write( f"""<tr><td>{count_files}</td><td>{code_http}</td><td>{url_encoding}</td>\
        <td><a href=\"{line}\">{line}</a></td><td><a href=\"{pages_dir}/{count_table}-{count_files}.html\">{count_table}-{count_files}</a></td>\
            <td><a href=\"{dump_dir}{count_table}-{count_files}.txt\">{count_table}-{count_files}</a></td>\
                <td><a href=\"{dump_dir}{count_table}-{count_files}-index.txt\">index-{count_table}-{count_files}</a></td>\
                    <td><a href=\"{dump_dir}{count_table}-{count_files}-bigramme.txt\">bigramme-{count_table}-{count_files}</a></td>\
                        <td><a href=\"{context_dir}{count_table}-{count_files}-contexte.txt\">contexte-{count_table}-{count_files}</a></td></tr>""")
    file.close()
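
# Illustration (hypothetical values): with count_table = 1 and count_files = 2, a row of the
# table links to ../PAGES-ASPIREES/1-2.html, ../DUMP-TEXT/1-2.txt, ../DUMP-TEXT/1-2-index.txt,
# ../DUMP-TEXT/1-2-bigramme.txt and ../CONTEXTES/1-2-contexte.txt.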

################ For NLP 

def get_word_split (text):
    """
    Detects first if the file the text comes from is japanese. 
    If so, segments and tokenizes japanese. 
    Else, uses the nltk tokenizer.  
        Parameter :
            text (str) : need any string
    """
    if re.match(r'jp.*\.txt',file):

        new_text =[]

        tagger = fugashi.Tagger() #the japanese tagger
        jp_text = [token.surface for token in tagger(dump_text)] #x.surface calls the word itself
        for i in jp_text : 
            if i not in punct :
                new_text.append(i)
                
        return new_text

    else : 
        tokenizer = RegexpTokenizer("[\w]+")
        words = tokenizer.tokenize(text)
        words = list(map(lambda w:w.lower(), words))

        return words

def get_corpus (text):
    """
    Gets the result list of the function get_word_split () and 
    returns it into a string.   
    Parameter :
        text (str) : needs any string
    """

    words = get_word_split(text)
    sentences = ""
    for word in words :
        if word not in stop_words:
            sentences = sentences + word + " "
    return sentences

def get_index(words):
    """
    Gets the result list of the function get_word_split () and 
    returns a list of tuples.   
    Parameter :
        text (list) : needs a list of strings 
    """

    occurrences = {}

    tokens = get_word_split(words)
    for token in tokens:
        occurrences.update({
            token : occurrences.get(token, 0)+1
    })

    # list of tuples
    occurrences = sorted(occurrences.items(), key=lambda x:x[0])
    occurrences = sorted(occurrences, key = lambda x:x[1], reverse = True)

    with open(f'{dump_dir}{count_table}-{count_files}-index.txt', 'w', encoding='utf-8') as w :
        for key, value in occurrences:
            w.write(f'{value}\t{key}\n')

    return occurrences
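
# Each line of the "-index.txt" file written above is "count<TAB>word", with the most
# frequent words first (hypothetical example line: "12\talcool").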

def get_bigram(lst):
    """
    Gets a list of sorted tuples and turns it into bigrams      
    Parameter :
        text (list) : needs a list of sorted tuples
    """
    bigram = list(nltk.bigrams(lst))

    list_bigrams = {}
    for a,b in bigram :
        list_bigrams.update({
            (a[0],b[0]) : list_bigrams.get((a[0],b[0]), 0)+1
        })

    list_bigrams = sorted(list_bigrams.items(), key = lambda x:x[0])
    list_bigrams = sorted(list_bigrams, key = lambda x:x[1], reverse = True)   

    f=open(f'{dump_dir}{count_table}-{count_files}-bigramme.txt', 'w', encoding='utf-8')
    for a,b in list_bigrams : 
        f.write(f'{b}\t{a[0]}\t{a[1]}\n')
    f.close()

    return list_bigrams
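
# Each line of the "-bigramme.txt" file written above is "count<TAB>item1<TAB>item2"; since
# get_bigram() receives the sorted index, the pairs counted are adjacent index entries.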

def get_context (text, regexp):
    """
    Gets the result list of the function get_word_split () and 
    returns context of a targeted pattern  (5 words before and after the target )
    Parameter :
        text (str) : need any string
        regexp (re.Pattern) : need a result from the re.compile method
    """
    #target_word = r'\b(A|a)lcools?\b'
    #target_word = re.compile(r'(A|a)lcools?')
    f=open(f'{context_dir}{count_table}-{count_files}.txt','w', encoding='utf-8')
    list=get_word_split(text)

    for word in re.finditer(regexp,text):
        try: print(list.index(word.group(0)))
        except ValueError:
            continue

        while list.index(word.group(0)):
            word_index=list.index(word.group(0))
            target_list=list[word_index-5:word_index+5]
            phrase=""

            for i in target_list: 
                phrase=phrase+i+" "
            #print(phrase)
            f.write(f'{phrase}\n')

            list=list[:word_index]+list[word_index+1:]

            try : word_index=list.index(word.group(0))
            except ValueError:
                break
    f.close()

if __name__ == '__main__' : 

    count_table = 0 #start counting the iterated files

    for file in os.listdir(urls_dir) : #iterates in the directory
        #print (file)
        

        if file == '.DS_Store': #on macOS, directories contain a hidden file named ".DS_Store"
            continue

        count_table += 1 #next iterated file gets a new number
        urls_list = open(f'{urls_dir}{file}', 'r', encoding= 'utf-8') #opens each file
        print (f"fichier lu : {file}")

        start_html() #starts a webpage per language file
        html_table() #start the html table

        count_files = 0 #start counting the iterated urls

        for line in urls_list : #iterates all links in the file
            
            #here, we reset the variables to None
            code_http=None
            url_encoding=None
            dump_text=None
            index=None
            bigram=None
            context = None
            

            count_files += 1 #next iterated URL gets a new number
            try : 
                code_http = url_code_http(line)
            except :
                print (f'Problème HTTPResponse avec le lien {count_table}-{count_files}')
                continue_table()
                continue
            
            print(f'Code HTTP de {count_table}-{count_files} : {code_http}')
            print(line)
            
            if code_http == 200 :

                try :
                    get_html_page(line)
                except :
                    print(f"le site {line} ne nous autorise pas à récupérer ses données")
                    code_http = 403 #a 403 code prevents our scraper from downloading the data, even though the first HTTPResponse was 200
                    url_encoding="-"
                    dump_text="-"
                    index="-"
                    bigram="-"
                    continue_table()
                    continue

                url_encoding = url_char_encoding(line)
                #print (url_encoding)
                url_encoding = url_encoding.upper()

                if url_encoding == 'UTF-8':
                    dump_text = get_dump_text(line)  
                    index = get_index(dump_text)
                    bigram = get_bigram(index)

                    # we change our behaviour depending on the name of the file.
                    # The file names need to be checked before starting the program
                    if  re.match(r"fr.*\.txt",file) :
                        f=open(f'{source_dir}0.fr_corpus.txt', 'a', encoding='utf-8') 
                        f.write(f'{get_corpus(dump_text)}') #appends each cleaned text to a single corpus file
                        f.close()
                        target_word = re.compile(r'(A|a)lcools?') #targeted_pattern for french
                        context = get_context(dump_text, target_word)
                        continue_table()
                    
                    elif re.match(r"en.*\.txt",file) :
                        f=open(f'{source_dir}0.en_corpus.txt', 'a', encoding='utf-8')
                        f.write(f'{get_corpus(dump_text)}')
                        f.close()
                        target_word = re.compile(r'(A|a)lcohols?') #targeted pattern for english
                        context=get_context(dump_text, target_word)
                        continue_table()

                    elif re.match(r'jp.*\.txt',file):
                        f=open(f'{source_dir}0.jp_corpus.txt', 'a', encoding='utf-8')
                        f.write(f'{get_corpus(dump_text)}')
                        f.close()
                        target_word = re.compile(r'お?酒') #targeted pattern for Japanese
                        context = get_context(dump_text, target_word)
                        continue_table()

                    else : 
                        #if any other file name is found, ignore it and continue the process.
                        continue
                else: 
                    #BeautifulSoup normally converts the contents to UTF-8 automatically,
                    #so no further handling should be needed; this branch is kept
                    #just in case BeautifulSoup fails to detect and convert the encoding.
                    dump_text = get_dump_text(line)  
                    index = get_index(dump_text)
                    bigram = get_bigram(index)

                    if  re.match(r"fr.*\.txt",file) :
                        f=open(f'{source_dir}0.fr_corpus.txt', 'a', encoding='utf-8')
                        f.write(f'{get_corpus(dump_text)}')
                        f.close()
                        target_word = re.compile(r'(A|a)lcools?')
                        context = get_context(dump_text, target_word)
                        continue_table()
                    
                    elif re.match(r"en.*\.txt",file) :
                        f=open(f'{source_dir}0.en_corpus.txt', 'a', encoding='utf-8')
                        f.write(f'{get_corpus(dump_text)}')
                        f.close()
                        target_word = re.compile(r'(A|a)lcohols?')
                        context = get_context(dump_text, target_word)
                        continue_table()

                    elif re.match(r'jp.*\.txt',file):
                        f=open(f'{source_dir}0.jp_corpus.txt', 'a', encoding='utf-8')
                        f.write(f'{get_corpus(dump_text)}')
                        f.close()
                        target_word = re.compile(r'お?酒')
                        context = get_context(dump_text, target_word)
                        continue_table()

            elif code_http == 404 : 
                #if a website url is not reachable anymore
                continue_table()
                
            else : #for any other HTTPResponse received, still tries to get the data
                try : #first test to make sure the website does not return an HTTPResponse 403 afterwards
                    get_html_page(line)
                    url_encoding = url_char_encoding(line)
                    #print (url_encoding)
                    url_encoding = url_encoding.upper()
                except : #when an error is caught, set all our variables to "-"
                    print (f"Le lien {line} n'a pas pu être parcouru.")
                    url_encoding="-"
                    dump_text="-"
                    index="-"
                    bigram="-"
                    context="-"
                    continue_table()
                    continue

                try : #then tries to get the language processes to work    
                    if url_encoding == 'UTF-8':
                        dump_text = get_dump_text(line)
                        index = get_index(dump_text)
                        bigram = get_bigram(index)

                    try : # within the processing, try to carry on with the context and corpus building
                        if  re.match(r"fr.*\.txt",file) :
                            f=open(f'{source_dir}0.fr_corpus.txt', 'a', encoding='utf-8')
                            f.write(f'{get_corpus(dump_text)}')
                            f.close()
                            target_word = re.compile(r'(A|a)lcools?')
                            context = get_context(dump_text, target_word)
                            continue_table()
                        
                        elif re.match(r"en.*\.txt",file) :
                            f=open(f'{source_dir}0.en_corpus.txt', 'a', encoding='utf-8')
                            f.write(f'{get_corpus(dump_text)}')
                            f.close()
                            target_word = re.compile(r'(A|a)lcohols?')
                            context = get_context(dump_text, target_word)
                            continue_table()

                        elif re.match(r'jp.*\.txt',file):
                            f=open(f'{source_dir}0.jp_corpus.txt', 'a', encoding='utf-8')
                            f.write(f'{get_corpus(dump_text)}')
                            f.close()
                            target_word = re.compile(r'お?酒')
                            context = get_context(dump_text, target_word)
                            continue_table()
                    except : # if raises exception, sets context to "-"
                        print(f"Le lien {line}a été parcouru, mais le traitement a été interrompu")
                        context = "-"
                        continue

                except : #if raises exceptions, sets all variables to "-"
                    print("le programme n'a pas pu se lancer.")
                    dump_text="-"
                    index="-"
                    bigram="-"
                    context="-"
                    continue
        end_html() #writes the end of the HTML page for this language file

        urls_list.close() #closes the current URL file
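
As a rough illustration of the indexing and bigram counting used above, here is a minimal, self-contained sketch (independent of the script's global variables and output files, on an invented token list):

import nltk

tokens = ["le", "vin", "et", "le", "vin", "rouge"] #invented tokens, for illustration only

# frequency index, sorted as in get_index(): alphabetically, then by decreasing count
occurrences = {}
for token in tokens:
    occurrences[token] = occurrences.get(token, 0) + 1
index = sorted(sorted(occurrences.items(), key=lambda x: x[0]),
               key=lambda x: x[1], reverse=True)
print(index) # [('le', 2), ('vin', 2), ('et', 1), ('rouge', 1)]

# bigram counts over adjacent items, with nltk.bigrams() as in get_bigram()
bigrams = {}
for a, b in nltk.bigrams(tokens):
    bigrams[(a, b)] = bigrams.get((a, b), 0) + 1
print(sorted(bigrams.items(), key=lambda x: x[1], reverse=True))

Note that in the script itself get_bigram() receives the sorted index produced by get_index(), so the pairs it counts are adjacent entries of that index rather than adjacent words of the running text.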
    
     

Python script for Japanese segmentation

import argparse, fugashi, string

######################## Declarations

####### Add arguments to the script 

#Script Description
parser = argparse.ArgumentParser(description= """Script for Japanese segmentation""")

#Adds arguments to the script 
parser.add_argument('-f','--file', help = 'Path to a .txt file containing Japanese text to be segmented', type = str)
parser.add_argument('echo', nargs ='?', help= 'Japanese text to be segmented, given directly on the command line', type = str)

#Commits the arguments above
args = parser.parse_args()

####### Punctuation variables
punct = list(string.punctuation)
punct.append("—")
punct.append("...")
punct.append("–")
punct.append("…")

#Most of the Japanese punctuation
jp_punct = (
[chr(i) for i in range(0x3000,0x3041)] + 
[chr(i) for i in range(0x30fb,0x3100)] + 
[chr(i) for i in range(0xff1a,0xff21)] + 
[chr(i) for i in range(0xff3b,0xff41)] +
[chr(i) for i in range(0xff5b,0xff66)]
)
jp_punct.append('(')
jp_punct.append(')')
jp_punct.append('“')
jp_punct.append('”')
jp_punct.append('!')

#all-in-one punctuation list
punct = punct + jp_punct

 

#################### Japanese segmentation
tagger = fugashi.Tagger() #the japanese tagger

new_jp_text = [] #holds the segmented, punctuation-free tokens

if args.echo :

    jp_text = [word.surface for word in tagger(args.echo)] #word.surface is the word form itself
    for i in jp_text :
        if i not in punct :
            new_jp_text.append(i)

if args.file :
    with open(args.file, 'r', encoding='utf-8') as f :
        text = f.read()
    jp_text = [word.surface for word in tagger(text)] #word.surface is the word form itself
    for i in jp_text :
        if i not in punct :
            new_jp_text.append(i)

if __name__ == '__main__' :

    print (*new_jp_text)
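
For reference, here is a minimal sketch of the fugashi calls both scripts rely on (assuming fugashi and a UniDic dictionary such as unidic-lite are installed; the sentence is an arbitrary example):

import fugashi

tagger = fugashi.Tagger() #the Japanese tagger used in both scripts
sentence = "お酒を飲みます" #arbitrary example sentence
tokens = [word.surface for word in tagger(sentence)] #word.surface is the surface form of each token
print(*tokens)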