Python script for the HTML table
#!/usr/bin/env python3
import re, os, nltk, fugashi, string
from nltk.tokenize import RegexpTokenizer
from bs4 import BeautifulSoup
import requests
from urllib.request import Request, urlopen
################ Declarations
# Paths of the directories used by the script
context_dir = "../CONTEXTES/"
dump_dir = "../DUMP-TEXT/"
images_dir = "../IMAGES/"
pages_dir = "../PAGES-ASPIREES/"
tables_dir = "../TABLEAUX/"
urls_dir = "../URLS/"
source_dir ="../V-RESSOURCES/"
# Punctuation variables
punct = list(string.punctuation)
punct.append("—")
punct.append("...")
punct.append("–")
punct.append("…")
#Most of the Japanese punctuation
jp_punct = (
[chr(i) for i in range(0x3000,0x3041)] +
[chr(i) for i in range(0x30fb,0x3100)] +
[chr(i) for i in range(0xff1a,0xff21)] +
[chr(i) for i in range(0xff3b,0xff41)] +
[chr(i) for i in range(0xff5b,0xff66)]
)
jp_punct.append('(')
jp_punct.append(')')
jp_punct.append('“')
jp_punct.append('”')
jp_punct.append('!')
punct = punct + jp_punct
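#For illustration, the range 0x3000-0x3040 covers characters such as '、' (0x3001), '。' (0x3002) and '「' (0x300c),
#and the range 0xff1a-0xff20 covers fullwidth characters such as '：' (0xff1a), '？' (0xff1f) and '＠' (0xff20).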
#Japanese stop_words list
stop_words = []
with open(f'{source_dir}stopwords_jp.txt', 'r', encoding='utf-8') as f :
reader = f.readlines()
for word in reader :
word = word.strip()
stop_words.append(word)
################ HTTP requests
def scrape_header(url):
"""
Sends my informations to the websites so they know or don't ban my IP from
scraping datas.
Parameter :
url (str) : need an url
"""
headers = {'User-Agent': 'Mozilla/5.0'}
request = Request(url, headers=headers)
return request
def url_code_http(url):
    """
    Gets the HTTP status code returned by a request to the URL.
    Parameter :
        url (str) : a URL string
    """
url = url.replace("\n", "")
info = requests.get(url)
code_http= info.status_code
return code_http
################# Scraping the websites
def open_url(url):
"""
Opens the url and returns the method to process it.
Parameter :
url (str) : need a url
"""
link = scrape_header(url)
html =urlopen(link)
return BeautifulSoup(html, 'lxml')
def get_html_page(url):
    """
    Saves the whole HTML content of a web page onto the hard drive.
    Parameter :
        url (str) : a URL string
    """
soup = open_url(url)
text_html = soup.prettify()
f=open(f"{pages_dir}/{count_table}-{count_files}.html", 'w', encoding='utf-8')
f.write(text_html)
f.close()
def get_dump_text(url):
    """
    Dumps all the raw text of an HTML page into a text file.
    Parameter :
        url (str) : a URL string
    """
soup = open_url(url)
dump_text = soup.get_text()
w=open(f"{dump_dir}{count_table}-{count_files}.txt",'w', encoding='utf-8')
w.write(dump_text)
w.close()
return dump_text
def url_char_encoding(url):
    """
    Gets the original character encoding declared by the website,
    since BeautifulSoup automatically converts the text to UTF-8.
    Parameter :
        url (str) : a URL string
    """
    soup = open_url(url)
return soup.original_encoding
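# For example, for a page that declares <meta charset="shift_jis">, this would typically
# return something like 'shift_jis' (the exact string depends on what BS4 detected).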
################# This part builds an HTML table
################# The function continue_table() gathers the information produced by the other functions of this program
def start_html():
file=open(f"{tables_dir}tableau-{count_table}.html", 'w', encoding='utf-8')
file.write( """<!DOCTYPE html>\n
<html>\n
<head>\n
<title>Projet sur le Web : Alcool</title>\n
<meta charset = 'utf-8'/>
</head>\n
<body>\n\t""")
file.close()
def end_html():
file=open(f"{tables_dir}tableau-{count_table}.html", 'a', encoding='utf-8')
file.write( """</body>\n
</html>""")
file.close()
def html_table():
file=open(f"{tables_dir}tableau-{count_table}.html", 'a', encoding='utf-8')
file.write( """<table align=\"center\" border=\"1px\" bordercolor=blue>\n
<tr><td>N°</td><td>http-code</td><td>Encodage</td><td>URL</td>\
<td>Page aspirée</td><td>Filtrage html</td><td>Index</td><td>Bigramme</td><td>Contexte</td></tr>""")
file.close()
def continue_table():
file=open(f"{tables_dir}tableau-{count_table}.html", 'a', encoding='utf-8')
file.write( f"""<tr><td>{count_files}</td><td>{code_http}</td><td>{url_encoding}</td>\
<td><a href=\"{line}\">{line}</a></td><td><a href=\"{pages_dir}/{count_table}-{count_files}.html\">{count_table}-{count_files}</a></td>\
<td><a href=\"{dump_dir}{count_table}-{count_files}.txt\">{count_table}-{count_files}</a></td>\
<td><a href=\"{dump_dir}{count_table}-{count_files}-index.txt\">index-{count_table}-{count_files}</a></td>\
<td><a href=\"{dump_dir}{count_table}-{count_files}-bigramme.txt\">bigramme-{count_table}-{count_files}</a></td>\
<td><a href=\"{context_dir}{count_table}-{count_files}-contexte.txt\">contexte-{count_table}-{count_files}</a></td></tr>""")
file.close()
################ For NLP
def get_word_split(text):
    """
    First checks whether the current file (global variable `file`) is Japanese.
    If so, segments and tokenizes the text with fugashi;
    otherwise uses the nltk tokenizer.
    Parameter :
        text (str) : any string
    """
    if re.match(r'jp.*\.txt', file):
        new_text = []
        tagger = fugashi.Tagger() #the japanese tagger
        jp_text = [token.surface for token in tagger(text)] #token.surface is the surface form of the token
for i in jp_text :
if i not in punct :
new_text.append(i)
return new_text
else :
        tokenizer = RegexpTokenizer(r"[\w]+")
words = tokenizer.tokenize(text)
words = list(map(lambda w:w.lower(), words))
return words
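# Hypothetical usage sketch (assuming the global `file` names an English URL list, e.g. "en_urls.txt"):
#   get_word_split("Alcohol is risky!")  ->  ['alcohol', 'is', 'risky']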
def get_corpus(text):
    """
    Takes the token list returned by get_word_split(), removes the stop words
    and joins the remaining tokens into a single string.
    Parameter :
        text (str) : any string
    """
words = get_word_split(text)
sentences = ""
for word in words :
if word not in stop_words:
sentences = sentences + word + " "
return sentences
def get_index(words):
    """
    Tokenizes the text with get_word_split() and returns a frequency index
    as a list of (token, count) tuples, also written to an -index.txt file.
    Parameter :
        words (str) : any string
    """
occurrences = {}
tokens = get_word_split(words)
for token in tokens:
occurrences.update({
token : occurrences.get(token, 0)+1
})
# list of tuples
occurrences = sorted(occurrences.items(), key=lambda x:x[0])
occurrences = sorted(occurrences, key = lambda x:x[1], reverse = True)
with open(f'{dump_dir}{count_table}-{count_files}-index.txt', 'w', encoding='utf-8') as w :
for key, value in occurrences:
w.write(f'{value}\t{key}\n')
return occurrences
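# Hypothetical usage sketch (assuming count_table and count_files are set by the main loop):
#   get_index("the cat saw the dog")  ->  [('the', 2), ('cat', 1), ('dog', 1), ('saw', 1)]
# and the same pairs are written to the -index.txt file as "count<TAB>token" lines.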
def get_bigram(lst):
    """
    Takes the list of sorted (token, count) tuples produced by get_index(),
    counts bigrams of consecutive tokens and writes them to a -bigramme.txt file.
    Parameter :
        lst (list) : a list of sorted (token, count) tuples
    """
bigram = list(nltk.bigrams(lst))
list_bigrams = {}
for a,b in bigram :
list_bigrams.update({
(a[0],b[0]) : list_bigrams.get((a[0],b[0]), 0)+1
})
list_bigrams = sorted(list_bigrams.items(), key = lambda x:x[0])
list_bigrams = sorted(list_bigrams, key = lambda x:x[1], reverse = True)
f=open(f'{dump_dir}{count_table}-{count_files}-bigramme.txt', 'w', encoding='utf-8')
for a,b in list_bigrams :
f.write(f'{b}\t{a[0]}\t{a[1]}\n')
f.close()
return list_bigrams
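# Note: each line of the -bigramme.txt file is "count<TAB>token1<TAB>token2", where the pairs
# come from consecutive entries of the frequency-sorted index, not from adjacent words in the text.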
def get_context(text, regexp):
    """
    Tokenizes the text with get_word_split() and writes the context of a
    targeted pattern (5 words before and 5 after each occurrence) to a file.
    Parameters :
        text (str) : any string
        regexp (re.Pattern) : a compiled pattern from re.compile()
    """
    #target_word = r'\b(A|a)lcools?\b'
    #target_word = re.compile(r'(A|a)lcools?')
    f = open(f'{context_dir}{count_table}-{count_files}-contexte.txt', 'w', encoding='utf-8')
    words = get_word_split(text)
    for match in re.finditer(regexp, text):
        target = match.group(0)
        #one context line per remaining occurrence of the matched token
        while target in words:
            word_index = words.index(target)
            target_list = words[max(0, word_index - 5):word_index + 6]
            f.write(' '.join(target_list) + '\n')
            #remove this occurrence so the next iteration finds the following one
            words = words[:word_index] + words[word_index + 1:]
    f.close()
if __name__ == '__main__' :
count_table = 0 #start counting the iterated files
for file in os.listdir(urls_dir) : #iterates in the directory
#print (file)
        if file == '.DS_Store': #on macOS, directories contain a hidden file named ".DS_Store"
continue
count_table += 1 #next iterated file gets a new number
urls_list = open(f'{urls_dir}{file}', 'r', encoding= 'utf-8') #opens each file
print (f"fichier lu : {file}")
start_html() #starts a webpage per language file
html_table() #start the html table
count_files = 0 #start counting the iterated urls
for line in urls_list : #iterates all links in the file
#here, we set back the variables to None
code_http=None
url_encoding=None
dump_text=None
index=None
bigram=None
context = None
            count_files += 1 #next iterated URL gets a new number
try :
code_http = url_code_http(line)
except :
print (f'Problème HTTPResponse avec le lien {count_table}-{count_files}')
continue_table()
continue
print(f'Code HTTP de {count_table}-{count_files} : {code_http}')
print(line)
if code_http == 200 :
try :
get_html_page(line)
except :
print(f"le site {line} ne nous autorise pas à récupérer ses données")
                    code_http = 403 #a 403 is returned when the website refuses to let our scraper download its data, even after an HTTPResponse 200
url_encoding="-"
dump_text="-"
index="-"
bigram="-"
continue_table()
continue
url_encoding = url_char_encoding(line)
#print (url_encoding)
url_encoding = url_encoding.upper()
if url_encoding == 'UTF-8':
dump_text = get_dump_text(line)
index = get_index(dump_text)
bigram = get_bigram(index)
                    # we change the behaviour depending on the name of the file.
                    # The file names need to be checked before starting the program
if re.match(r"fr.*\.txt",file) :
f=open(f'{source_dir}0.fr_corpus.txt', 'a', encoding='utf-8')
f.write(f'{get_corpus(dump_text)}') #appends each cleaned text into a one and only file
f.close()
target_word = re.compile(r'(A|a)lcools?') #targeted_pattern for french
context = get_context(dump_text, target_word)
continue_table()
elif re.match(r"en.*\.txt",file) :
f=open(f'{source_dir}0.en_corpus.txt', 'a', encoding='utf-8')
f.write(f'{get_corpus(dump_text)}')
f.close()
target_word = re.compile(r'(A|a)lcohols?') #targeted pattern for english
context=get_context(dump_text, target_word)
continue_table()
elif re.match(r'jp.*\.txt',file):
f=open(f'{source_dir}0.jp_corpus.txt', 'a', encoding='utf-8')
f.write(f'{get_corpus(dump_text)}')
f.close()
target_word = re.compile(r'お?酒') #targeted pattern for Japanese
context = get_context(dump_text, target_word)
continue_table()
else :
                        #if any other file is found, ignore it and continue the process.
continue
else:
                    #The BS4 library automatically converts the content to UTF-8,
                    #so no further conversion lines should be needed.
                    #This branch is kept just in case BS4 fails to detect and convert the encoding.
dump_text = get_dump_text(line)
index = get_index(dump_text)
bigram = get_bigram(index)
if re.match(r"fr.*\.txt",file) :
f=open(f'{source_dir}0.fr_corpus.txt', 'a', encoding='utf-8')
                        f.write(f'{get_corpus(dump_text)}') #appends the cleaned text to the French corpus file
f.close()
target_word = re.compile(r'(A|a)lcools?')
context = get_context(dump_text, target_word)
continue_table()
elif re.match(r"en.*\.txt",file) :
f=open(f'{source_dir}0.en_corpus.txt', 'a', encoding='utf-8')
f.write(f'{get_corpus(dump_text)}')
f.close()
target_word = re.compile(r'(A|a)lcohols?')
context = get_context(dump_text, target_word)
continue_table()
elif re.match(r'jp.*\.txt',file):
                        f=open(f'{source_dir}0.jp_corpus.txt', 'a', encoding='utf-8')
f.write(f'{get_corpus(dump_text)}')
f.close()
target_word = re.compile(r'お?酒')
context = get_context(dump_text, target_word)
continue_table()
elif code_http == 404 :
#if a website url is not reachable anymore
continue_table()
else : #for any other HTTPResponse received, still tries to get the data
                try : #first test to be sure the website doesn't give us an HTTPResponse 403 afterwards
get_html_page(line)
url_encoding = url_char_encoding(line)
#print (url_encoding)
url_encoding = url_encoding.upper()
                except : #if an error is caught, sets all our variables to "-"
print (f"Le lien {line} n'a pas pu être parcouru.")
url_encoding="-"
dump_text="-"
index="-"
bigram="-"
context="-"
continue_table()
continue
try : #then tries to get the language processes to work
if url_encoding == 'UTF-8':
dump_text = get_dump_text(line)
index = get_index(dump_text)
bigram = get_bigram(index)
                        try : # within the processing, try to carry on with the context and corpus building
if re.match(r"fr.*\.txt",file) :
f=open(f'{source_dir}0.fr_corpus.txt', 'a', encoding='utf-8')
f.write(f'{get_corpus(dump_text)}')
f.close()
target_word = re.compile(r'(A|a)lcools?')
                                context = get_context(dump_text, target_word)
continue_table()
elif re.match(r"en.*\.txt",file) :
f=open(f'{source_dir}0.en_corpus.txt', 'a', encoding='utf-8')
f.write(f'{get_corpus(dump_text)}')
f.close()
target_word = re.compile(r'(A|a)lcohols?')
                                context = get_context(dump_text, target_word)
continue_table()
elif re.match(r'jp.*\.txt',file):
f=open(f'{source_dir}0.jp_corpus.txt', 'a', encoding='utf-8')
f.write(f'{get_corpus(dump_text)}')
f.close()
target_word = re.compile(r'お?酒')
                                context = get_context(dump_text, target_word)
continue_table()
except : # if raises exception, sets context to "-"
print(f"Le lien {line}a été parcouru, mais le traitement a été interrompu")
context = "-"
continue
except : #if raises exceptions, sets all variables to "-"
print("le programme n'a pas pu se lancer.")
dump_text="-"
index="-"
bigram="-"
context="-"
continue
else :
continue
end_html() #Writes the end of our html page
        urls_list.close() #closes the current URL-list file
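For reference, each file in ../URLS/ must contain one URL per line, and its name must start with the language code (fr, en or jp) that the main loop dispatches on. A minimal sketch of how such a file could be created (the file name and URLs below are hypothetical examples, not part of the project data):

with open("../URLS/fr_alcool.txt", "w", encoding="utf-8") as f:
    f.write("https://fr.wikipedia.org/wiki/Alcool\n")
    f.write("https://fr.wikipedia.org/wiki/Vin\n")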
Python script for Japanese segmentation
import argparse, fugashi, string
######################## Declarations
####### Add arguments to the script
#Script Description
parser = argparse.ArgumentParser(description= """Script for japanese segmentation""")
#Adds arguments to the script
parser.add_argument('-f', '--file', help = 'A .txt file containing Japanese text to be segmented', type = str)
parser.add_argument('echo', nargs = '?', help = 'Japanese text to be segmented', type = str)
#Parses the arguments defined above
args = parser.parse_args()
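#Example invocations from a terminal (the script name "segmentation_jp.py" is hypothetical):
#   python3 segmentation_jp.py "今日はお酒を飲みます。"
#   python3 segmentation_jp.py -f jp_dump.txt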
####### Punctuation variables
punct = list(string.punctuation)
punct.append("—")
punct.append("...")
punct.append("–")
punct.append("…")
#Most of the Japanese punctuation
jp_punct = (
[chr(i) for i in range(0x3000,0x3041)] +
[chr(i) for i in range(0x30fb,0x3100)] +
[chr(i) for i in range(0xff1a,0xff21)] +
[chr(i) for i in range(0xff3b,0xff41)] +
[chr(i) for i in range(0xff5b,0xff66)]
)
jp_punct.append('(')
jp_punct.append(')')
jp_punct.append('“')
jp_punct.append('”')
jp_punct.append('!')
#merges both punctuation lists into punct
punct = punct + jp_punct
#################### Japanese segmentation
tagger = fugashi.Tagger() #the japanese tagger
new_jp_text = []
if args.echo :
    jp_text = [word.surface for word in tagger(args.echo)] #word.surface is the surface form of the token
    for i in jp_text :
        if i not in punct :
            new_jp_text.append(i)
if args.file :
    f = open(f"{args.file}", 'r', encoding='utf-8')
    text = f.read()
    f.close()
    jp_text = [word.surface for word in tagger(text)] #word.surface is the surface form of the token
    for i in jp_text :
        if i not in punct :
            new_jp_text.append(i)
if __name__ == '__main__' :
    print (*new_jp_text)
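# For example, "お酒を飲みます。" would print roughly: お 酒 を 飲み ます
# (the exact segmentation depends on the dictionary installed with fugashi, e.g. unidic-lite).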