Commits on Source (2)
# Comment to test branches
# Definition of classes and methods associated # Definition of classes and methods associated
class Document: class Document:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 25 13:04:45 2019

@author: luissalamanca
"""
import gensim
import os
import copy
import smart_open
import random
import time
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
import utils_proc as ut_p
import utils_annot as ut_a
import xml.etree.ElementTree as ET
from tmtoolkit.preprocess import TMPreproc
from scipy import linalg
import itertools
import matplotlib as mpl
from sklearn import mixture
from sklearn.feature_extraction.text import TfidfVectorizer
import goslate
gs = goslate.Goslate()
# Endless cycle of colours, one per mixture component drawn by plot_results.
# NOTE(review): the original statement was cut off after 'gold' and left the
# list unclosed; it is completed with 'darkorange' as in the scikit-learn GMM
# plotting example this code appears to be adapted from -- confirm.
color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold',
                              'darkorange'])
def plot_results(X, Y_, means, covariances, title):
    """Scatter the samples of each mixture component and draw its ellipse.

    Only the first two columns of ``X`` are plotted, and each covariance is
    decomposed with ``linalg.eigh``, so this is only meaningful for
    two-dimensional data fitted with full covariance matrices.

    Parameters
    ----------
    X : array indexed as ``X[rows, column]``; the samples.
    Y_ : array of component indices, one per row of ``X``.
    means, covariances : per-component parameters, iterated in parallel.
    title : title set on the current axes.
    """
    splot = plt.subplot(1, 1, 1)
    for i, (mean, covar, color) in enumerate(zip(
            means, covariances, color_iter)):
        # as the DP will not use every component it has access to
        # unless it needs it, we shouldn't plot the redundant
        # components.
        if not np.any(Y_ == i):
            continue

        v, w = linalg.eigh(covar)
        v = 2. * np.sqrt(2.) * np.sqrt(v)
        u = w[0] / linalg.norm(w[0])

        plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)

        # Plot an ellipse to show the Gaussian component
        angle = np.arctan(u[1] / u[0])
        angle = 180. * angle / np.pi  # convert to degrees
        # `angle` is passed by keyword: recent matplotlib releases no longer
        # accept it positionally.
        ell = mpl.patches.Ellipse(mean, v[0], v[1], angle=180. + angle,
                                  color=color)
        # Without these calls the ellipse is built but never shown.
        ell.set_clip_box(splot.bbox)
        ell.set_alpha(0.5)
        splot.add_artist(ell)

    # Axis limits are left automatic (fixed limits were disabled originally).
    plt.title(title)
def train_doc2vec(train_corpus, vector_size=100, min_count=5, epochs=40):
    """Build and train a Doc2Vec model on ``train_corpus``.

    Parameters
    ----------
    train_corpus : list of ``TaggedDocument``; must be re-iterable because it
        is traversed once for the vocabulary and again for training.
    vector_size, min_count, epochs : forwarded to ``Doc2Vec``.

    Returns
    -------
    The trained model. The elapsed training time is printed.
    """
    model = gensim.models.doc2vec.Doc2Vec(vector_size=vector_size,
                                          min_count=min_count,
                                          epochs=epochs)
    # The vocabulary has to exist before train() is called; building it is
    # also what sets model.corpus_count, which train() is given below.
    model.build_vocab(train_corpus)
    st_t = time.time()
    model.train(train_corpus, total_examples=model.corpus_count,
                epochs=model.epochs)
    print('Time training %f' % (time.time() - st_t))
    return model
# 18 seconds for 8220 paragraphs and a 13k-word vocabulary, with 100 dimensions
# 127 seconds for 35995 paragraphs (German only), with 100 dimensions
def feat_tsne(model):
    """Collect the document vectors of ``model`` and embed them in 2-D.

    Returns a pair ``(vectors, embedding)``: the matrix holding one document
    vector per row, and the result of running t-SNE with two components on
    it. The time spent in t-SNE is printed.
    """
    n_docs = model.docvecs.count
    doc_vectors = np.zeros((n_docs, model.vector_size))
    for row in range(n_docs):
        doc_vectors[row, :] = model.docvecs[row]

    started = time.time()
    embedding = TSNE(n_components=2).fit_transform(doc_vectors)
    print('Time %f' % (time.time() - started))
    return doc_vectors, embedding
# 1562 seconds for 35995 parag and 100 dim
def scatter_lang(feat_mat_doc_embed, lang_use, list_lang):
    """Scatter-plot the 2-D document embedding on a new figure.

    Parameters
    ----------
    feat_mat_doc_embed : array indexed as ``[rows, column]``; columns 0 and 1
        are plotted.
    lang_use : when equal to ``'all'`` one scatter series is drawn per
        distinct entry of ``list_lang`` (labelled with that entry); otherwise
        all points are drawn as a single series.
    list_lang : sequence of language labels, one per row of the embedding.
    """
    plt.figure(figsize=(40, 20))
    if lang_use == 'all':
        # Convert once instead of on every iteration.
        langs = np.array(list_lang)
        for lang in np.unique(langs):
            ind_p = np.flatnonzero(langs == lang)
            plt.scatter(feat_mat_doc_embed[ind_p, 0],
                        feat_mat_doc_embed[ind_p, 1],
                        label=lang, alpha=0.6)
    else:
        # Single-language run: no grouping needed. Without this `else` every
        # point would be drawn again on top of the per-language series.
        plt.scatter(feat_mat_doc_embed[:, 0], feat_mat_doc_embed[:, 1],
                    alpha=0.6)
def fit_gmm_plot(feat_mat, n_comp = 10, cov_t = 'full', flag_plot = 0):
    """Fit a Bayesian Gaussian mixture to ``feat_mat`` and label its rows.

    ``n_comp`` and ``cov_t`` are passed to the estimator as ``n_components``
    and ``covariance_type``. When ``flag_plot`` is truthy the fit is also
    drawn with ``plot_results``.

    Returns the fitted estimator and the array of predicted component
    indices for ``feat_mat``.
    """
    estimator = mixture.BayesianGaussianMixture(n_components=n_comp,
                                                covariance_type=cov_t)
    dpgmm = estimator.fit(feat_mat)
    res_pred = np.array(dpgmm.predict(feat_mat))
    if flag_plot:
        plot_results(
            feat_mat, res_pred, dpgmm.means_, dpgmm.covariances_,
            'Bayesian Gaussian Mixture with a Dirichlet process prior')
    return dpgmm, res_pred
def scatter_classes(feat_mat_embed, res_pred):
    """Scatter-plot the 2-D embedding on a new figure, one series per cluster.

    ``res_pred`` holds the cluster index of every row of ``feat_mat_embed``;
    each distinct index becomes one series labelled with that index.
    """
    plt.figure(figsize=(40, 20))
    for clust in np.unique(res_pred):
        members = np.argwhere(res_pred == clust)
        if len(members) == 0:
            continue
        xs = feat_mat_embed[members, 0]
        ys = feat_mat_embed[members, 1]
        plt.scatter(xs, ys, label=clust, alpha=0.6)
def write_to_txt(text_par_dict, res_pred, str_ex = '', folder_res = '../../../'):
    """Write the paragraphs of every cluster to its own text file.

    For each distinct value ``c`` in ``res_pred`` a file named
    ``folder_res + 'Text_in_Clust' + str(c) + str_ex + '.txt'`` is created
    (overwriting any existing file), containing one line per paragraph
    assigned to that cluster.

    Parameters
    ----------
    text_par_dict : mapping from integer paragraph index to its text.
    res_pred : array-like of cluster indices; position ``i`` is the cluster
        of paragraph ``i``.
    str_ex : suffix inserted before the ``.txt`` extension.
    folder_res : path prefix of the output files (concatenated as-is, so it
        should end with a path separator).
    """
    # The caller's folder_res is honoured; it used to be overwritten with the
    # default unconditionally.
    res_pred = np.asarray(res_pred)
    for clust in np.unique(res_pred):
        path = folder_res + 'Text_in_Clust' + str(clust) + str_ex + '.txt'
        # Context manager so each handle is closed (and flushed) before the
        # next cluster is written. UTF-8 is explicit so non-ASCII paragraphs
        # do not depend on the platform's default encoding.
        with open(path, 'w', encoding='utf-8') as fp_o:
            for i_p in np.flatnonzero(res_pred == clust):
                fp_o.write(text_par_dict[int(i_p)] + '\n')
def rem_clusters(cl_keep, train_corpus, list_lang, text_par_dict, res_pred=None):
    """Keep only the paragraphs whose cluster is in ``cl_keep``.

    The surviving documents are re-tagged with consecutive integers starting
    at 0, and the language list and text dictionary are re-indexed the same
    way.

    Parameters
    ----------
    cl_keep : container of cluster indices to keep.
    train_corpus : sequence of ``TaggedDocument``; element ``[0]`` of each
        entry (its words) is reused.
    list_lang : language label per paragraph. Entries are copied only for
        indices it actually contains, so a shorter or empty list is tolerated.
    text_par_dict : mapping from integer paragraph index to its text.
    res_pred : cluster index per paragraph. Optional for backward
        compatibility: when omitted, the module-level ``res_pred`` is used,
        as this function did before.

    Returns
    -------
    ``(train_corpus_red, list_lang_red, text_par_dict_red)``
    """
    if res_pred is None:
        res_pred = globals()['res_pred']

    train_corpus_red = []
    list_lang_red = []
    text_par_dict_red = {}
    count = 0
    for i_c, doc in enumerate(train_corpus):
        if res_pred[i_c] not in cl_keep:
            continue
        # The re-tagged document must be stored; previously it was built and
        # then dropped, so the reduced corpus always came back empty.
        train_corpus_red.append(
            gensim.models.doc2vec.TaggedDocument(doc[0], [count]))
        if i_c < len(list_lang):
            list_lang_red.append(list_lang[i_c])
        text_par_dict_red[count] = text_par_dict[int(i_c)]
        count += 1
    return train_corpus_red, list_lang_red, text_par_dict_red
# In this function I am using paragraph embeddings to classify the different
# sections in the documents. In principle, we should have the following: laws,
# votes and speeches. There are probably many more, though, with subtler
# differences, like amendments, etc.
# Location of the data and the range of years to process
folder_database = '../../data/AB/'
years = np.arange(1891,1899)  # 1891 .. 1898 inclusive
name_meta = '01_rawmeta'
name_outcorrxml = '04_correctedxml'
# Generate initial corpus
train_corpus = list()
list_lang = list()
# Paragraph texts collected while building the corpus. It is written to and
# read further down but was never created, which raised a NameError.
text_par_dict = dict()
count_par = 0
# 'all' keeps every paragraph; any other value is compared against the
# language label of each paragraph
lang_use = 'all'
flag_lemma = 0
flag_byblock = 1
# Build the Doc2Vec training corpus: for every year, walk the corrected XML
# files and turn their text into TaggedDocument entries of train_corpus.
# NOTE(review): this listing has lost its indentation and some branch lines
# (several `else:` arms, by the look of it) seem to be missing, so the code is
# left untouched and only annotated -- compare with version control before
# running it.
for year in years:
print('Year: %d' % year)
files, list_ids = ut_p.get_list(year, folder_database, name_outcorrxml)
for i_file in range(len(files)):
name_xml = files[i_file]
# The metadata file is named after the part before the first '_'
name_xml_meta = name_xml.split('_')[0] + '.xml'
path_meta_xml_file = ut_p.get_handlerfile(name_xml_meta, folder_database, name_meta)
# Only files for which check_if_discussion returns a truthy value are read
disc_flag = ut_a.check_if_discussion(path_meta_xml_file)
if disc_flag:
h_xml = ut_p.get_handlerfile(name_xml, folder_database, name_outcorrxml)
XML_tree = ET.parse(h_xml)
XML_main = XML_tree.getroot()
# i_p indexes the children of the root and i_t the children of XML_main[i_p]
# (presumably pages and their textboxes -- confirm); the while loop advances
# them by hand until the last child of the last element has been visited
not_end = 1
i_p = 0; i_t = 0
while not_end:
# for i_p in range(len(XML_main)):
# for i_t in range(len(XML_main[i_p])):
# We group by blocks
text_par = ''
if flag_byblock:
# Concatenate the text of consecutive textboxes that carry the same 'block'
# attribute into a single paragraph
flag_block = 1
if XML_main[i_p][i_t].tag == 'textbox':
id_block = XML_main[i_p][i_t].attrib['block']
while flag_block:
if XML_main[i_p][i_t].tag == 'textbox':
if XML_main[i_p][i_t].attrib['block'] == id_block:
type_t = ut_a.get_textbox_type(XML_main[i_p][i_t])
# Only textboxes of type text_col1 / text_col2 contribute text
if type_t in ('text_col1','text_col2'):
text_par = text_par + ' ' + ut_a.get_complete_text(XML_main[i_p][i_t])[0]
# NOTE(review): two consecutive `flag_block = 0` lines -- presumably the
# `else:` arms of the two tests above, whose `else:` lines are missing here
flag_block = 0
flag_block = 0
i_t += 1
if i_t == len(XML_main[i_p]):
i_p += 1; i_t = 0
if i_p == len(XML_main):
flag_block = 0; not_end = 0
# NOTE(review): presumably the `else:` arm of `if flag_byblock:` -- one
# paragraph per textbox instead of one per block
if XML_main[i_p][i_t].tag == 'textbox':
type_t = ut_a.get_textbox_type(XML_main[i_p][i_t])
if type_t in ('text_col1','text_col2'):
text_par = ut_a.get_complete_text(XML_main[i_p][i_t])[0]
i_t += 1
if i_t == len(XML_main[i_p]):
i_p += 1; i_t = 0
if i_p == len(XML_main):
not_end = 0
# Perform stemming
# Paragraphs that are empty once spaces are stripped are skipped
if len(text_par.strip(' ')):
dict_lang = ut_a.identify_language(text_par)
label_language = ut_a.label_language_simple(dict_lang)
# NOTE(review): nothing here appends to list_lang, and text_par_dict is only
# written in the flag_lemma branch with 'doc<N>' string keys, while
# write_to_txt and rem_clusters index it with integers -- confirm against the
# original file
if lang_use == 'all':
train_corpus.append(gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(text_par), [count_par]))
count_par += 1
if label_language == lang_use:
if flag_lemma:
name_k = 'doc' + str(count_par)
text_par_dict[name_k] = text_par
#text_par_d = {'doc1': text_par}
#preproc = TMPreproc(text_par_d, language = lang_use)
#st_t = time.time()
#print('Time lemmatize %f' % (time.time() - st_t))
#train_corpus.append(gensim.models.doc2vec.TaggedDocument(list(preproc.tokens['doc1']), [count_par]))
train_corpus.append(gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(text_par), [count_par]))
count_par += 1
# When lemmatization is requested, the corpus is built from the tokens of a
# TMPreproc object created over all collected paragraphs.
# NOTE(review): indentation was lost, so it is unclear how many of the lines
# below belong to the `if` -- confirm against the original file.
if flag_lemma:
preproc = TMPreproc(text_par_dict, language = lang_use)
# NOTE(review): nothing runs between starting this timer and printing it, so
# the reported lemmatization time is ~0; a processing call may be missing
st_t = time.time()
print('Time lemmatize %f' % (time.time() - st_t))
st_t = time.time()
# List comprehension used only for its side effect of appending one
# TaggedDocument per 'doc<i>' entry of preproc.tokens
[train_corpus.append(gensim.models.doc2vec.TaggedDocument(preproc.tokens['doc' + str(i_c)], [i_c])) for i_c in range(len(text_par_dict))]
print('Build train corpus %f' % (time.time() - st_t))
# First iteration: train on the whole corpus, embed the document vectors with
# t-SNE, cluster the (non-embedded) document vectors and dump each cluster to
# a text file
model = train_doc2vec(train_corpus, vector_size=50, min_count=5)
feat_mat_doc, feat_mat_doc_embed = feat_tsne(model)
scatter_lang(feat_mat_doc_embed, lang_use, list_lang)
dpgmm, res_pred = fit_gmm_plot(feat_mat_doc, n_comp = 10, cov_t = 'full', flag_plot = 0)
scatter_classes(feat_mat_doc_embed, res_pred)
write_to_txt(text_par_dict, res_pred)
# Second iteration: keep only clusters 4 and 7 of the first fit and repeat
# with a larger vector size and diagonal covariances.
# NOTE(review): the cluster ids are hard-coded, presumably picked by hand from
# one particular run -- they will not carry over to a different fit.
# rem_clusters reads the module-level res_pred assigned above.
train_corpus_red1, list_lang_red1, text_par_dict_red1 = rem_clusters([4,7], train_corpus, list_lang, text_par_dict)
model_red1 = train_doc2vec(train_corpus_red1, vector_size=100, min_count=5)
feat_mat_doc_red1, feat_mat_doc_embed_red1 = feat_tsne(model_red1)
scatter_lang(feat_mat_doc_embed_red1, lang_use, list_lang_red1)
dpgmm_red1, res_pred_red1 = fit_gmm_plot(feat_mat_doc_red1, n_comp = 10, cov_t = 'diag', flag_plot = 0)
scatter_classes(feat_mat_doc_embed_red1, res_pred_red1)
# Output files of this iteration get the '_Red1' suffix
write_to_txt(text_par_dict_red1, res_pred_red1, str_ex = '_Red1')
# Positions of the paragraphs labelled 'french' in the reduced corpus
ind_f = np.argwhere(np.array(list_lang_red1) == 'french')
from googletrans import Translator
translator = Translator()
# Example of what a translation object looks like when displayed:
# <Translated src=ko dest=en text=Good evening. pronunciation=Good evening.>
# Translate the third French paragraph into German. The result is not stored,
# and ind_f[2] raises IndexError when fewer than three paragraphs are French.
translator.translate(text_par_dict_red1[int(ind_f[2])], dest='de')
# <Translated src=ko dest=ja text=こんにちは。 pronunciation=Kon'nichiwa.>
# Paths of the Lee train and test corpora inside gensim's test data folder
test_data_dir = os.sep.join((gensim.__path__[0], 'test', 'test_data'))
lee_train_file = os.sep.join((test_data_dir, 'lee_background.cor'))
lee_test_file = os.sep.join((test_data_dir, 'lee.cor'))
# smart_open can be used with really long files in an optimal way, as it
# streams the data
def read_corpus(fname, tokens_only=False):
    """Yield one pre-processed document per line of ``fname``.

    Parameters
    ----------
    fname : path of a text file, read with ISO-8859-1 encoding.
    tokens_only : when true, yield the plain token list of each line (test
        data); otherwise yield a ``TaggedDocument`` tagged with the line
        number (training data).
    """
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags. Without the `else`, every line
                # would also yield a TaggedDocument in tokens_only mode.
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
# yield works like return, but for generators: the function runs until it
# reaches a yield, hands back that element, and resumes from the same point
# when the next element is requested. This goes on until the for loop has
# been exhausted
# Load the Lee corpora: tagged documents for training, bare tokens for testing
train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))
# Training
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
# The vocabulary must be built before train(); this also sets
# model.corpus_count, which is passed as total_examples below
model.build_vocab(train_corpus)
st_t = time.time()
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
print('Time %f' % (time.time() - st_t))
# Get all word vectors, and plot embeddings
vocab = list(model.wv.vocab)
feat_mat = np.zeros((len(vocab),model.vector_size))
for i_c, word in enumerate(vocab):
    feat_mat[i_c,:] = model.wv.get_vector(word)
feat_mat_embed = TSNE(n_components = 2).fit_transform(feat_mat)
# All words are drawn as points; only a random subset of at most n_words of
# them is annotated with its text
n_words = 2000
plt.figure(figsize=(40, 20))
sca = plt.scatter(feat_mat_embed[:,0], feat_mat_embed[:,1])
ind_rand = np.random.permutation(len(vocab))
for i_w in ind_rand[:n_words]:
    plt.text(feat_mat_embed[i_w,0], feat_mat_embed[i_w,1], vocab[i_w])
# Commands
# get similar words: model.wv.most_similar('war', topn = 10)
# Assessing the model: re-infer a vector for every training document and find
# the position of that document among the documents most similar to it
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    # Store the results; the two lists were created but never filled
    ranks.append(rank)
    second_ranks.append(sims[1])
# Mapping of doc vecs: stack the document vectors, embed them in 2-D with
# t-SNE and scatter-plot the result
feat_mat_doc = np.zeros((model.docvecs.count,model.vector_size))
for i_c in np.arange(model.docvecs.count):
    feat_mat_doc[i_c,:] = model.docvecs[i_c]
feat_mat_doc_embed = TSNE(n_components = 2).fit_transform(feat_mat_doc)
n_docs = 300  # not used by the lines visible here
plt.figure(figsize=(40, 20))
sca = plt.scatter(feat_mat_doc_embed[:,0], feat_mat_doc_embed[:,1])
\ No newline at end of file
return XML_new return XML_new
# Given the number of occurrences of the stopwords of each language, this
# assigns a language label to a specific textbox, also allowing for textboxes
# that mix languages: every language whose share exceeds ratio_similar times
# the share of the most frequent language is included in the label.
# input:
# - aux_dict_l: corresponds to dict_language_counts
# output:
# - lang_max: string
def label_language_simple(aux_dict_l):
    """Return a language label derived from per-language counts.

    Parameters
    ----------
    aux_dict_l : mapping from language name to its count.

    Returns
    -------
    str
        ``'languageNotIdentified'`` when the counts sum to zero (including an
        empty mapping). Otherwise the names of all languages whose share of
        the total is greater than ``ratio_similar`` times the largest share,
        joined with ``'_'`` in the mapping's iteration order, and prefixed
        with ``'mixed_'`` when more than one language qualifies.
    """
    # specify a similarity ratio
    ratio_similar = 0.8

    total = sum(aux_dict_l.values())
    # Without counts there is nothing to decide. This fallback must apply only
    # here; applied unconditionally it overwrote every computed label.
    if not total:
        return 'languageNotIdentified'

    shares = {lang: count / total for lang, count in aux_dict_l.items()}
    threshold = max(shares.values()) * ratio_similar
    close_to_top = [lang for lang, share in shares.items() if share > threshold]

    lang_max = '_'.join(close_to_top)
    if len(close_to_top) > 1:
        lang_max = 'mixed_' + lang_max
    return lang_max
def get_cities(list_citizenship):
    """Split comma-separated citizenship strings into a flat list of names.

    Every entry of ``list_citizenship`` is split on ``','`` and the last five
    characters of each piece are dropped (presumably a trailing code such as
    ``' (BE)'`` -- confirm against the data). Whitespace around the commas is
    not stripped.
    """
    return [city[:-5] for item in list_citizenship for city in item.split(',')]