Compare revisions
Commits on Source (2)
@@ -29,7 +29,7 @@ import preproc_docs
# Comment to test branches
# Definition of classes and methods associated
class Document:
...
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 25 13:04:45 2019
@author: luissalamanca
"""
import gensim
import os
import copy
import smart_open
import random
import time
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
#sys.path.append('./src/python/')
import utils_proc as ut_p
import utils_annot as ut_a
import xml.etree.ElementTree as ET
from tmtoolkit.preprocess import TMPreproc
from scipy import linalg
import itertools
import matplotlib as mpl
from sklearn import mixture
from sklearn.feature_extraction.text import TfidfVectorizer
import goslate
gs = goslate.Goslate()
#%%
color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold',
                              'darkorange'])
def plot_results(X, Y_, means, covariances, title):
    splot = plt.subplot(1, 1, 1)
    for i, (mean, covar, color) in enumerate(zip(
            means, covariances, color_iter)):
        v, w = linalg.eigh(covar)
        v = 2. * np.sqrt(2.) * np.sqrt(v)
        u = w[0] / linalg.norm(w[0])
        # as the DP will not use every component it has access to
        # unless it needs it, we shouldn't plot the redundant
        # components.
        if not np.any(Y_ == i):
            continue
        plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
        # Plot an ellipse to show the Gaussian component
        angle = np.arctan(u[1] / u[0])
        angle = 180. * angle / np.pi  # convert to degrees
        ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color)
        ell.set_clip_box(splot.bbox)
        ell.set_alpha(0.5)
        splot.add_artist(ell)
    #plt.xlim(-9., 5.)
    #plt.ylim(-3., 6.)
    plt.xticks(())
    plt.yticks(())
    plt.title(title)
def train_doc2vec(train_corpus, vector_size=100, min_count=5, epochs=40):
    model = gensim.models.doc2vec.Doc2Vec(vector_size=vector_size, min_count=min_count, epochs=epochs)
    model.build_vocab(train_corpus)
    st_t = time.time()
    model.train(train_corpus, total_examples = model.corpus_count, epochs = model.epochs)
    print('Time training %f' % (time.time() - st_t))
    return model
# 18 seconds for 8220 paragraphs and 13k words for the vocabulary, and 100 dimensions
# 127 seconds for 35995 paragraphs, corresponding only to German, and 100 dim
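# A minimal usage sketch (not part of the pipeline below; the German sentence is just
# an illustrative placeholder): train on the tagged corpus and infer a vector for an
# unseen paragraph.
# model = train_doc2vec(train_corpus, vector_size=100, min_count=5, epochs=40)
# new_vec = model.infer_vector(gensim.utils.simple_preprocess('Der Rat stimmt dem Antrag zu.'))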
def feat_tsne(model):
    feat_mat_doc = np.zeros((model.docvecs.count, model.vector_size))
    for i_c in np.arange(model.docvecs.count):
        feat_mat_doc[i_c, :] = model.docvecs[i_c]
    st_t = time.time()
    feat_mat_doc_embed = TSNE(n_components = 2).fit_transform(feat_mat_doc)
    print('Time %f' % (time.time() - st_t))
    return feat_mat_doc, feat_mat_doc_embed
# 1562 seconds for 35995 paragraphs and 100 dim
def scatter_lang(feat_mat_doc_embed, lang_use, list_lang):
    plt.figure(figsize=(40, 20))
    if lang_use == 'all':
        ind_c = np.zeros(len(list_lang))
        for i_l, lang in enumerate(np.unique(np.array(list_lang))):
            ind_p = np.argwhere(np.array(list_lang) == lang)
            if len(ind_p):
                ind_c[ind_p] = i_l
                plt.scatter(feat_mat_doc_embed[ind_p, 0], feat_mat_doc_embed[ind_p, 1], label = lang, alpha = 0.6)
    else:
        plt.scatter(feat_mat_doc_embed[:, 0], feat_mat_doc_embed[:, 1], alpha = 0.6)
    #plt.colorbar(sca)
    plt.box(False)
    plt.legend()
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
def fit_gmm_plot(feat_mat, n_comp = 10, cov_t = 'full', flag_plot = 0):
    dpgmm = mixture.BayesianGaussianMixture(n_components = n_comp,
                                            covariance_type = cov_t).fit(feat_mat)
    res_pred = np.array(dpgmm.predict(feat_mat))
    if flag_plot:
        plot_results(feat_mat, res_pred, dpgmm.means_, dpgmm.covariances_,
                     'Bayesian Gaussian Mixture with a Dirichlet process prior')
        plt.show()
    return dpgmm, res_pred
def scatter_classes(feat_mat_embed, res_pred):
    plt.figure(figsize=(40, 20))
    for i_l, clust in enumerate(np.unique(res_pred)):
        ind_p = np.argwhere(res_pred == clust)
        if len(ind_p):
            plt.scatter(feat_mat_embed[ind_p, 0], feat_mat_embed[ind_p, 1], label = clust, alpha = 0.6)
    plt.box(False)
    plt.legend()
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
def write_to_txt(text_par_dict, res_pred, str_ex = '', folder_res = '../../../'):
    # Dump the paragraphs assigned to each cluster into a separate text file for inspection
    for clust in np.unique(res_pred):
        fp_o = open(folder_res + 'Text_in_Clust' + str(clust) + str_ex + '.txt', 'w+')
        ind_p = np.argwhere(res_pred == clust)
        for i_p in ind_p:
            fp_o.write(text_par_dict[int(i_p)] + '\n')
            fp_o.write('\n')
        fp_o.close()
def rem_clusters(cl_keep, train_corpus, list_lang, text_par_dict, res_pred):
    # Keep only the paragraphs whose predicted cluster is in cl_keep, re-tagging
    # the retained documents with consecutive ids
    train_corpus_red = list()
    list_lang_red = list()
    text_par_dict_red = dict()
    count = 0
    for i_c in range(len(train_corpus)):
        if res_pred[i_c] in cl_keep:
            aux_in = gensim.models.doc2vec.TaggedDocument(train_corpus[i_c][0], [count])
            train_corpus_red.append(aux_in)
            list_lang_red.append(list_lang[i_c])
            text_par_dict_red[count] = text_par_dict[int(i_c)]
            count += 1
    return train_corpus_red, list_lang_red, text_par_dict_red
#%%
# In this script, paragraph embeddings are used to classify the different
# sections of the documents. In principle, we should find the following: laws,
# votes and speeches. There may well be many more section types, with subtler
# differences, such as amendments.
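# Outline of the cells below: build a corpus of paragraph-level TaggedDocuments from
# the corrected XML files, train a Doc2Vec model on it, project the document vectors
# to 2D with t-SNE, cluster them with a Bayesian Gaussian mixture, dump every cluster
# to a text file for manual inspection, then keep only the selected clusters and
# repeat the embedding and clustering on the reduced corpus.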
folder_database = '../../data/AB/'
years = np.arange(1891,1899)
name_meta = '01_rawmeta'
name_outcorrxml = '04_correctedxml'
#%%
# Generate initial corpus
train_corpus = list()
list_lang = list()
count_par = 0
lang_use = 'all'
flag_lemma = 0
text_par_dict=dict()
flag_byblock = 1
for year in years:
    print('Year: %d' % year)
    files, list_ids = ut_p.get_list(year, folder_database, name_outcorrxml)
    for i_file in range(len(files)):
        name_xml = files[i_file]
        name_xml_meta = name_xml.split('_')[0] + '.xml'
        path_meta_xml_file = ut_p.get_handlerfile(name_xml_meta, folder_database, name_meta)
        disc_flag = ut_a.check_if_discussion(path_meta_xml_file)
        if disc_flag:
            h_xml = ut_p.get_handlerfile(name_xml, folder_database, name_outcorrxml)
            XML_tree = ET.parse(h_xml)
            XML_main = XML_tree.getroot()
            not_end = 1
            i_p = 0; i_t = 0
            while not_end:
                # for i_p in range(len(XML_main)):
                #     for i_t in range(len(XML_main[i_p])):
                # We group by blocks
                text_par = ''
                if flag_byblock:
                    flag_block = 1
                    if XML_main[i_p][i_t].tag == 'textbox':
                        id_block = XML_main[i_p][i_t].attrib['block']
                    while flag_block:
                        if XML_main[i_p][i_t].tag == 'textbox':
                            if XML_main[i_p][i_t].attrib['block'] == id_block:
                                type_t = ut_a.get_textbox_type(XML_main[i_p][i_t])
                                if type_t in ('text_col1', 'text_col2'):
                                    text_par = text_par + ' ' + ut_a.get_complete_text(XML_main[i_p][i_t])[0]
                            else:
                                flag_block = 0
                        else:
                            flag_block = 0
                        i_t += 1
                        if i_t == len(XML_main[i_p]):
                            i_p += 1; i_t = 0
                            if i_p == len(XML_main):
                                flag_block = 0; not_end = 0
                else:
                    if XML_main[i_p][i_t].tag == 'textbox':
                        type_t = ut_a.get_textbox_type(XML_main[i_p][i_t])
                        if type_t in ('text_col1', 'text_col2'):
                            text_par = ut_a.get_complete_text(XML_main[i_p][i_t])[0]
                    i_t += 1
                    if i_t == len(XML_main[i_p]):
                        i_p += 1; i_t = 0
                        if i_p == len(XML_main):
                            not_end = 0
                # Identify the language of the block and add it to the training corpus,
                # either tokenised directly or stored for later lemmatization
                if len(text_par.strip(' ')):
                    dict_lang = ut_a.identify_language(text_par)
                    label_language = ut_a.label_language_simple(dict_lang)
                    list_lang.append(label_language)
                    if lang_use == 'all':
                        train_corpus.append(gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(text_par), [count_par]))
                        text_par_dict[count_par] = text_par
                        count_par += 1
                    else:
                        if label_language == lang_use:
                            if flag_lemma:
                                name_k = 'doc' + str(count_par)
                                text_par_dict[name_k] = text_par
                                #text_par_d = {'doc1': text_par}
                                #preproc = TMPreproc(text_par_d, language = lang_use)
                                #st_t = time.time()
                                #preproc.tokenize().pos_tag().lemmatize()
                                #print('Time lemmatize %f' % (time.time() - st_t))
                                #train_corpus.append(gensim.models.doc2vec.TaggedDocument(list(preproc.tokens['doc1']), [count_par]))
                            else:
                                train_corpus.append(gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(text_par), [count_par]))
                            count_par += 1

# When lemmatization is requested, all stored paragraphs are lemmatized in one pass
# and the training corpus is built from the lemmatized tokens
if flag_lemma:
    preproc = TMPreproc(text_par_dict, language = lang_use)
    st_t = time.time()
    preproc.tokenize().pos_tag().lemmatize()
    print('Time lemmatize %f' % (time.time() - st_t))
    st_t = time.time()
    for i_c in range(len(text_par_dict)):
        train_corpus.append(gensim.models.doc2vec.TaggedDocument(preproc.tokens['doc' + str(i_c)], [i_c]))
    print('Build train corpus %f' % (time.time() - st_t))
#%%
# First iteration
model = train_doc2vec(train_corpus, vector_size=50, min_count=5)
feat_mat_doc, feat_mat_doc_embed = feat_tsne(model)
scatter_lang(feat_mat_doc_embed, lang_use, list_lang)
dpgmm, res_pred = fit_gmm_plot(feat_mat_doc, n_comp = 10, cov_t = 'full', flag_plot = 0)
scatter_classes(feat_mat_doc_embed, res_pred)
#%%
write_to_txt(text_par_dict, res_pred)
#%%
# Second iteration
# Keep only the paragraphs assigned to clusters 4 and 7 and repeat the pipeline
train_corpus_red1, list_lang_red1, text_par_dict_red1 = rem_clusters([4,7], train_corpus, list_lang, text_par_dict, res_pred)
model_red1 = train_doc2vec(train_corpus_red1, vector_size=100, min_count=5)
feat_mat_doc_red1, feat_mat_doc_embed_red1 = feat_tsne(model_red1)
scatter_lang(feat_mat_doc_embed_red1, lang_use, list_lang_red1)
dpgmm_red1, res_pred_red1 = fit_gmm_plot(feat_mat_doc_red1, n_comp = 10, cov_t = 'diag', flag_plot = 0)
scatter_classes(feat_mat_doc_embed_red1, res_pred_red1)
#%%
write_to_txt(text_par_dict_red1, res_pred_red1, str_ex = '_Red1')
#%%
ind_f = np.argwhere(np.array(list_lang_red1) == 'french')
from googletrans import Translator
translator = Translator()
# translate() returns a Translated object, e.g. <Translated src=fr dest=de text=...>
translator.translate(text_par_dict_red1[int(ind_f[2])], dest='de')
#%%
'''
# Set file names for train and test data
test_data_dir = '{}'.format(os.sep).join([gensim.__path__[0], 'test', 'test_data'])
lee_train_file = test_data_dir + os.sep + 'lee_background.cor'
lee_test_file = test_data_dir + os.sep + 'lee.cor'
#%%
# smart_open can handle very long files efficiently, as it streams the data
def read_corpus(fname, tokens_only=False):
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])
# yield works like return, but for iterators: the function runs until the yield,
# hands back that element, and resumes from the same point on the next request,
# until the for loop is exhausted
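# A tiny illustration with a hypothetical helper: a generator yields values lazily,
# one per request, so a large corpus never has to be held in memory at once.
# def first_n(n):
#     for i in range(n):
#         yield i
# list(first_n(3))  # -> [0, 1, 2]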
train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))
#%%
# Training
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(train_corpus)
st_t = time.time()
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
print('Time %f' % (time.time() - st_t))
#%%
# Get all word vectors, and plot the embeddings
vocab = list(model.wv.vocab)
feat_mat = np.zeros((len(vocab), model.vector_size))
for i_c, word in enumerate(vocab):
    feat_mat[i_c, :] = model.wv.get_vector(word)
feat_mat_embed = TSNE(n_components = 2).fit_transform(feat_mat)
#%%
n_words = 2000
plt.figure(figsize=(40, 20))
sca = plt.scatter(feat_mat_embed[:, 0], feat_mat_embed[:, 1])
ind_rand = np.random.permutation(len(vocab))
for i_w in ind_rand[:n_words]:
    plt.text(feat_mat_embed[i_w, 0], feat_mat_embed[i_w, 1], vocab[i_w])
#plt.colorbar(sca)
plt.box(False)
plt.xticks([])
plt.yticks([])
plt.tight_layout()
#%%
# Commands
# get similar words: model.wv.most_similar('war', topn = 10)
#%%
# Assessing the model
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    second_ranks.append(sims[1])
#%%
# Mapping of the doc vectors
feat_mat_doc = np.zeros((model.docvecs.count, model.vector_size))
for i_c in np.arange(model.docvecs.count):
    feat_mat_doc[i_c, :] = model.docvecs[i_c]
feat_mat_doc_embed = TSNE(n_components = 2).fit_transform(feat_mat_doc)
n_docs = 300
plt.figure(figsize=(40, 20))
sca = plt.scatter(feat_mat_doc_embed[:, 0], feat_mat_doc_embed[:, 1])
#plt.colorbar(sca)
plt.box(False)
plt.xticks([])
plt.yticks([])
plt.tight_layout()
'''
\ No newline at end of file
@@ -1023,6 +1023,36 @@ def label_language(XML_new, ind_p, ind_t, aux_dict_l):
    return XML_new
# Simply, given the number of occurrences of the stopwords, it assigns a label
# to a specific textbox, also considering the possibility of textboxes
# mixing languages. The value ratio_similar is used for that case.
# input:
#   - aux_dict_l: corresponds to dict_language_counts
# output:
#   - lang_max: string
def label_language_simple(aux_dict_l):
    # specify a similarity ratio
    ratio_similar = 0.8
    # if there are counts, determine the language
    if sum(aux_dict_l.values()):
        aux_dict_l_norm = {k: v / total for total in (sum(aux_dict_l.values()),) for k, v in aux_dict_l.items()}
        lang_max_aux = max(aux_dict_l_norm.keys(), key=(lambda key: aux_dict_l_norm[key]))
        lang_max = ''
        count_l = 0
        for lang in aux_dict_l_norm.keys():
            if (aux_dict_l_norm[lang] > aux_dict_l_norm[lang_max_aux] * ratio_similar):
                if count_l > 0:
                    lang_max += '_'
                lang_max += lang
                count_l += 1
        if count_l > 1:
            lang_max = 'mixed_' + lang_max
    else:
        lang_max = 'languageNotIdentified'
    return lang_max
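# Illustrative calls with hypothetical stopword counts (the order of languages in the
# mixed label follows the order of the keys in the dict):
#   label_language_simple({'german': 40, 'french': 5, 'italian': 0})   -> 'german'
#   label_language_simple({'german': 40, 'french': 38, 'italian': 0})  -> 'mixed_german_french'
#   label_language_simple({'german': 0, 'french': 0, 'italian': 0})    -> 'languageNotIdentified'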
def get_cities(list_citizenship):
    return [city[:-5] for item in list_citizenship for city in item.split(',')]
...