Compare revisions
Commits on Source (2)
@@ -29,7 +29,7 @@ import preproc_docs
# Comment to test branches
# Definition of classes and methods associated
class Document:
...
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 25 13:04:45 2019
@author: luissalamanca
"""
import gensim
import os
import copy
import smart_open
import random
import time
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
#sys.path.append('./src/python/')
import utils_proc as ut_p
import utils_annot as ut_a
import xml.etree.ElementTree as ET
from tmtoolkit.preprocess import TMPreproc
from scipy import linalg
import itertools
import matplotlib as mpl
from sklearn import mixture
from sklearn.feature_extraction.text import TfidfVectorizer
import goslate
gs = goslate.Goslate()
#%%
color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold',
                              'darkorange'])
def plot_results(X, Y_, means, covariances, title):
    splot = plt.subplot(1, 1, 1)
    for i, (mean, covar, color) in enumerate(zip(
            means, covariances, color_iter)):
        v, w = linalg.eigh(covar)
        v = 2. * np.sqrt(2.) * np.sqrt(v)
        u = w[0] / linalg.norm(w[0])
        # as the DP will not use every component it has access to
        # unless it needs it, we shouldn't plot the redundant
        # components.
        if not np.any(Y_ == i):
            continue
        plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
        # Plot an ellipse to show the Gaussian component
        angle = np.arctan(u[1] / u[0])
        angle = 180. * angle / np.pi  # convert to degrees
        ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color)
        ell.set_clip_box(splot.bbox)
        ell.set_alpha(0.5)
        splot.add_artist(ell)
    #plt.xlim(-9., 5.)
    #plt.ylim(-3., 6.)
    plt.xticks(())
    plt.yticks(())
    plt.title(title)
def train_doc2vec(train_corpus, vector_size=100, min_count=5, epochs=40):
    model = gensim.models.doc2vec.Doc2Vec(vector_size=vector_size, min_count=min_count, epochs=epochs)
    model.build_vocab(train_corpus)
    st_t = time.time()
    model.train(train_corpus, total_examples = model.corpus_count, epochs = model.epochs)
    print('Time training %f' % (time.time() - st_t))
    return model
# 18 seconds for 8220 paragraphs and 13k words for the vocabulary, and 100 dimensions
# 127 seconds for 35995 paragraphs, corresponding only to German, and 100 dim
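# A minimal usage sketch (not part of the pipeline below; the German sentence is just
# an illustrative placeholder): train on the tagged corpus and infer a vector for an
# unseen paragraph.
# model = train_doc2vec(train_corpus, vector_size=100, min_count=5, epochs=40)
# new_vec = model.infer_vector(gensim.utils.simple_preprocess('Der Rat stimmt dem Antrag zu.'))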
def feat_tsne(model):
    feat_mat_doc = np.zeros((model.docvecs.count, model.vector_size))
    for i_c in np.arange(model.docvecs.count):
        feat_mat_doc[i_c, :] = model.docvecs[i_c]
    st_t = time.time()
    feat_mat_doc_embed = TSNE(n_components = 2).fit_transform(feat_mat_doc)
    print('Time %f' % (time.time() - st_t))
    return feat_mat_doc, feat_mat_doc_embed
# 1562 seconds for 35995 paragraphs and 100 dim
def scatter_lang(feat_mat_doc_embed, lang_use, list_lang):
    plt.figure(figsize=(40, 20))
    if lang_use == 'all':
        ind_c = np.zeros(len(list_lang))
        for i_l, lang in enumerate(np.unique(np.array(list_lang))):
            ind_p = np.argwhere(np.array(list_lang) == lang)
            if len(ind_p):
                ind_c[ind_p] = i_l
                plt.scatter(feat_mat_doc_embed[ind_p, 0], feat_mat_doc_embed[ind_p, 1], label = lang, alpha = 0.6)
    else:
        plt.scatter(feat_mat_doc_embed[:, 0], feat_mat_doc_embed[:, 1], alpha = 0.6)
    #plt.colorbar(sca)
    plt.box(False)
    plt.legend()
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
def fit_gmm_plot(feat_mat, n_comp = 10, cov_t = 'full', flag_plot = 0):
    dpgmm = mixture.BayesianGaussianMixture(n_components = n_comp,
                                            covariance_type = cov_t).fit(feat_mat)
    res_pred = np.array(dpgmm.predict(feat_mat))
    if flag_plot:
        plot_results(feat_mat, res_pred, dpgmm.means_, dpgmm.covariances_,
                     'Bayesian Gaussian Mixture with a Dirichlet process prior')
        plt.show()
    return dpgmm, res_pred
def scatter_classes(feat_mat_embed, res_pred):
    plt.figure(figsize=(40, 20))
    for i_l, clust in enumerate(np.unique(res_pred)):
        ind_p = np.argwhere(res_pred == clust)
        if len(ind_p):
            plt.scatter(feat_mat_embed[ind_p, 0], feat_mat_embed[ind_p, 1], label = clust, alpha = 0.6)
    plt.box(False)
    plt.legend()
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
def write_to_txt(text_par_dict, res_pred, str_ex = '', folder_res = '../../../'):
    # Dump the paragraphs assigned to each cluster into a separate text file for inspection
    for clust in np.unique(res_pred):
        fp_o = open(folder_res + 'Text_in_Clust' + str(clust) + str_ex + '.txt', 'w+')
        ind_p = np.argwhere(res_pred == clust)
        for i_p in ind_p:
            fp_o.write(text_par_dict[int(i_p)] + '\n')
            fp_o.write('\n')
        fp_o.close()
def rem_clusters(cl_keep, train_corpus, list_lang, text_par_dict, res_pred):
    # Keep only the paragraphs whose predicted cluster is in cl_keep, re-tagging
    # the retained documents with consecutive ids
    train_corpus_red = list()
    list_lang_red = list()
    text_par_dict_red = dict()
    count = 0
    for i_c in range(len(train_corpus)):
        if res_pred[i_c] in cl_keep:
            aux_in = gensim.models.doc2vec.TaggedDocument(train_corpus[i_c][0], [count])
            train_corpus_red.append(aux_in)
            list_lang_red.append(list_lang[i_c])
            text_par_dict_red[count] = text_par_dict[int(i_c)]
            count += 1
    return train_corpus_red, list_lang_red, text_par_dict_red
#%%
# In this script, paragraph embeddings are used to classify the different
# sections of the documents. In principle, we should find the following: laws,
# votes and speeches. There may well be many more section types, with subtler
# differences, such as amendments.
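# Outline of the cells below: build a corpus of paragraph-level TaggedDocuments from
# the corrected XML files, train a Doc2Vec model on it, project the document vectors
# to 2D with t-SNE, cluster them with a Bayesian Gaussian mixture, dump every cluster
# to a text file for manual inspection, then keep only the selected clusters and
# repeat the embedding and clustering on the reduced corpus.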
folder_database = '../../data/AB/'
years = np.arange(1891,1899)
name_meta = '01_rawmeta'
name_outcorrxml = '04_correctedxml'
#%%
# Generate initial corpus
train_corpus = list()
list_lang = list()
count_par = 0
lang_use = 'all'
flag_lemma = 0
text_par_dict=dict()
flag_byblock = 1
for year in years:
    print('Year: %d' % year)
    files, list_ids = ut_p.get_list(year, folder_database, name_outcorrxml)
    for i_file in range(len(files)):
        name_xml = files[i_file]
        name_xml_meta = name_xml.split('_')[0] + '.xml'
        path_meta_xml_file = ut_p.get_handlerfile(name_xml_meta, folder_database, name_meta)
        disc_flag = ut_a.check_if_discussion(path_meta_xml_file)
        if disc_flag:
            h_xml = ut_p.get_handlerfile(name_xml, folder_database, name_outcorrxml)
            XML_tree = ET.parse(h_xml)
            XML_main = XML_tree.getroot()
            not_end = 1
            i_p = 0; i_t = 0
            while not_end:
                # for i_p in range(len(XML_main)):
                #     for i_t in range(len(XML_main[i_p])):
                # We group by blocks
                text_par = ''
                if flag_byblock:
                    flag_block = 1
                    if XML_main[i_p][i_t].tag == 'textbox':
                        id_block = XML_main[i_p][i_t].attrib['block']
                    while flag_block:
                        if XML_main[i_p][i_t].tag == 'textbox':
                            if XML_main[i_p][i_t].attrib['block'] == id_block:
                                type_t = ut_a.get_textbox_type(XML_main[i_p][i_t])
                                if type_t in ('text_col1', 'text_col2'):
                                    text_par = text_par + ' ' + ut_a.get_complete_text(XML_main[i_p][i_t])[0]
                            else:
                                flag_block = 0
                        else:
                            flag_block = 0
                        i_t += 1
                        if i_t == len(XML_main[i_p]):
                            i_p += 1; i_t = 0
                            if i_p == len(XML_main):
                                flag_block = 0; not_end = 0
                else:
                    if XML_main[i_p][i_t].tag == 'textbox':
                        type_t = ut_a.get_textbox_type(XML_main[i_p][i_t])
                        if type_t in ('text_col1', 'text_col2'):
                            text_par = ut_a.get_complete_text(XML_main[i_p][i_t])[0]
                    i_t += 1
                    if i_t == len(XML_main[i_p]):
                        i_p += 1; i_t = 0
                        if i_p == len(XML_main):
                            not_end = 0
                # Identify the language of the block and add it to the training corpus,
                # either tokenised directly or stored for later lemmatization
                if len(text_par.strip(' ')):
                    dict_lang = ut_a.identify_language(text_par)
                    label_language = ut_a.label_language_simple(dict_lang)
                    list_lang.append(label_language)
                    if lang_use == 'all':
                        train_corpus.append(gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(text_par), [count_par]))
                        text_par_dict[count_par] = text_par
                        count_par += 1
                    else:
                        if label_language == lang_use:
                            if flag_lemma:
                                name_k = 'doc' + str(count_par)
                                text_par_dict[name_k] = text_par
                                #text_par_d = {'doc1': text_par}
                                #preproc = TMPreproc(text_par_d, language = lang_use)
                                #st_t = time.time()
                                #preproc.tokenize().pos_tag().lemmatize()
                                #print('Time lemmatize %f' % (time.time() - st_t))
                                #train_corpus.append(gensim.models.doc2vec.TaggedDocument(list(preproc.tokens['doc1']), [count_par]))
                            else:
                                train_corpus.append(gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(text_par), [count_par]))
                            count_par += 1

# When lemmatization is requested, all stored paragraphs are lemmatized in one pass
# and the training corpus is built from the lemmatized tokens
if flag_lemma:
    preproc = TMPreproc(text_par_dict, language = lang_use)
    st_t = time.time()
    preproc.tokenize().pos_tag().lemmatize()
    print('Time lemmatize %f' % (time.time() - st_t))
    st_t = time.time()
    for i_c in range(len(text_par_dict)):
        train_corpus.append(gensim.models.doc2vec.TaggedDocument(preproc.tokens['doc' + str(i_c)], [i_c]))
    print('Build train corpus %f' % (time.time() - st_t))
#%%
# First iteration
model = train_doc2vec(train_corpus, vector_size=50, min_count=5)
feat_mat_doc, feat_mat_doc_embed = feat_tsne(model)
scatter_lang(feat_mat_doc_embed, lang_use, list_lang)
dpgmm, res_pred = fit_gmm_plot(feat_mat_doc, n_comp = 10, cov_t = 'full', flag_plot = 0)
scatter_classes(feat_mat_doc_embed, res_pred)
#%%
write_to_txt(text_par_dict, res_pred)
#%%
# Second iteration
# Keep only the paragraphs assigned to clusters 4 and 7 and repeat the pipeline
train_corpus_red1, list_lang_red1, text_par_dict_red1 = rem_clusters([4,7], train_corpus, list_lang, text_par_dict, res_pred)
model_red1 = train_doc2vec(train_corpus_red1, vector_size=100, min_count=5)
feat_mat_doc_red1, feat_mat_doc_embed_red1 = feat_tsne(model_red1)
scatter_lang(feat_mat_doc_embed_red1, lang_use, list_lang_red1)
dpgmm_red1, res_pred_red1 = fit_gmm_plot(feat_mat_doc_red1, n_comp = 10, cov_t = 'diag', flag_plot = 0)
scatter_classes(feat_mat_doc_embed_red1, res_pred_red1)
#%%
write_to_txt(text_par_dict_red1, res_pred_red1, str_ex = '_Red1')
#%%
ind_f = np.argwhere(np.array(list_lang_red1) == 'french')
from googletrans import Translator
translator = Translator()
# translate() returns a Translated object, e.g. <Translated src=fr dest=de text=...>
translator.translate(text_par_dict_red1[int(ind_f[2])], dest='de')
#%%
'''
# Set file names for train and test data
test_data_dir = '{}'.format(os.sep).join([gensim.__path__[0], 'test', 'test_data'])
lee_train_file = test_data_dir + os.sep + 'lee_background.cor'
lee_test_file = test_data_dir + os.sep + 'lee.cor'
#%%
# smart_open can handle very long files efficiently, as it streams the data
def read_corpus(fname, tokens_only=False):
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])
# yield works like return, but for iterators: the function runs until the yield,
# hands back that element, and resumes from the same point on the next request,
# until the for loop is exhausted
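# A tiny illustration with a hypothetical helper: a generator yields values lazily,
# one per request, so a large corpus never has to be held in memory at once.
# def first_n(n):
#     for i in range(n):
#         yield i
# list(first_n(3))  # -> [0, 1, 2]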
train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))
#%%
# Training
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(train_corpus)
st_t = time.time()
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
print('Time %f' % (time.time() - st_t))
#%%
# Get all word vectors, and plot the embeddings
vocab = list(model.wv.vocab)
feat_mat = np.zeros((len(vocab), model.vector_size))
for i_c, word in enumerate(vocab):
    feat_mat[i_c, :] = model.wv.get_vector(word)
feat_mat_embed = TSNE(n_components = 2).fit_transform(feat_mat)
#%%
n_words = 2000
plt.figure(figsize=(40, 20))
sca = plt.scatter(feat_mat_embed[:, 0], feat_mat_embed[:, 1])
ind_rand = np.random.permutation(len(vocab))
for i_w in ind_rand[:n_words]:
    plt.text(feat_mat_embed[i_w, 0], feat_mat_embed[i_w, 1], vocab[i_w])
#plt.colorbar(sca)
plt.box(False)
plt.xticks([])
plt.yticks([])
plt.tight_layout()
#%%
# Commands
# get similar words: model.wv.most_similar('war', topn = 10)
#%%
# Assessing the model
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    second_ranks.append(sims[1])
#%%
# Mapping of the doc vectors
feat_mat_doc = np.zeros((model.docvecs.count, model.vector_size))
for i_c in np.arange(model.docvecs.count):
    feat_mat_doc[i_c, :] = model.docvecs[i_c]
feat_mat_doc_embed = TSNE(n_components = 2).fit_transform(feat_mat_doc)
n_docs = 300
plt.figure(figsize=(40, 20))
sca = plt.scatter(feat_mat_doc_embed[:, 0], feat_mat_doc_embed[:, 1])
#plt.colorbar(sca)
plt.box(False)
plt.xticks([])
plt.yticks([])
plt.tight_layout()
'''
\ No newline at end of file
@@ -1023,6 +1023,36 @@ def label_language(XML_new, ind_p, ind_t, aux_dict_l):
    return XML_new
# Simply, given the number of occurrences of the stopwords, it assigns a label
# to a specific textbox, also considering the possibility of textboxes
# mixing languages. The value ratio_similar is used for that case.
# input:
#   - aux_dict_l: corresponds to dict_language_counts
# output:
#   - lang_max: string
def label_language_simple(aux_dict_l):
    # specify a similarity ratio
    ratio_similar = 0.8
    # if there are counts, determine the language
    if sum(aux_dict_l.values()):
        aux_dict_l_norm = {k: v / total for total in (sum(aux_dict_l.values()),) for k, v in aux_dict_l.items()}
        lang_max_aux = max(aux_dict_l_norm.keys(), key=(lambda key: aux_dict_l_norm[key]))
        lang_max = ''
        count_l = 0
        for lang in aux_dict_l_norm.keys():
            if (aux_dict_l_norm[lang] > aux_dict_l_norm[lang_max_aux] * ratio_similar):
                if count_l > 0:
                    lang_max += '_'
                lang_max += lang
                count_l += 1
        if count_l > 1:
            lang_max = 'mixed_' + lang_max
    else:
        lang_max = 'languageNotIdentified'
    return lang_max
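# Illustrative calls with hypothetical stopword counts (the order of languages in the
# mixed label follows the order of the keys in the dict):
#   label_language_simple({'german': 40, 'french': 5, 'italian': 0})   -> 'german'
#   label_language_simple({'german': 40, 'french': 38, 'italian': 0})  -> 'mixed_german_french'
#   label_language_simple({'german': 0, 'french': 0, 'italian': 0})    -> 'languageNotIdentified'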
def get_cities(list_citizenship):
    return [city[:-5] for item in list_citizenship for city in item.split(',')]
...