Marta Balode / democrasci_preprocWP1
Compare revisions: master to 58d9762dff68690968c051527e305580b86dfe45

Source: marta.paula.balode/democrasci_preprocwp1 @ 58d9762dff68690968c051527e305580b86dfe45
Target: luis.salamanca/democrasci_preprocwp1 @ master

Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (2):
  7ed03ba3 · Test branches (Luis Salamanca, 5 years ago)
  58d9762d · Function for embedding, plotting paragraphs and writing to txt (Luis Salamanca, 5 years ago)
Showing 3 changed files with 447 additions and 1 deletion:
  src/python/def_classes.py   (+1, -1)
  src/python/embed_parag.py   (+416, -0)
  src/python/utils_annot.py   (+30, -0)
src/python/def_classes.py @ 58d9762d

@@ -29,7 +29,7 @@ import preproc_docs
+# Comment to test branches
 # Definition of classes and methods associated
 class Document:
src/python/embed_parag.py · new file (0 → 100644) @ 58d9762d
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 25 13:04:45 2019

@author: luissalamanca
"""

import gensim
import os
import copy
import smart_open
import random
import time
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
#sys.path.append('./src/python/')
import utils_proc as ut_p
import utils_annot as ut_a
import xml.etree.ElementTree as ET
from tmtoolkit.preprocess import TMPreproc
from scipy import linalg
import itertools
import matplotlib as mpl
from sklearn import mixture
from sklearn.feature_extraction.text import TfidfVectorizer
import goslate

gs = goslate.Goslate()

#%%
color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold', 'darkorange'])
def plot_results(X, Y_, means, covariances, title):
    splot = plt.subplot(1, 1, 1)
    for i, (mean, covar, color) in enumerate(zip(means, covariances, color_iter)):
        v, w = linalg.eigh(covar)
        v = 2. * np.sqrt(2.) * np.sqrt(v)
        u = w[0] / linalg.norm(w[0])
        # as the DP will not use every component it has access to
        # unless it needs it, we shouldn't plot the redundant
        # components.
        if not np.any(Y_ == i):
            continue
        plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
        # Plot an ellipse to show the Gaussian component
        angle = np.arctan(u[1] / u[0])
        angle = 180. * angle / np.pi  # convert to degrees
        ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color)
        ell.set_clip_box(splot.bbox)
        ell.set_alpha(0.5)
        splot.add_artist(ell)
    #plt.xlim(-9., 5.)
    #plt.ylim(-3., 6.)
    plt.xticks(())
    plt.yticks(())
    plt.title(title)
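# Illustration (not part of this commit): the ellipse geometry above follows
# from the eigendecomposition of each covariance matrix. A standalone check
# with an invented 2x2 covariance; the _demo names are introduced here only.
covar_demo = np.array([[2.0, 0.5],
                       [0.5, 1.0]])                  # toy covariance
v_demo, w_demo = linalg.eigh(covar_demo)             # eigenvalues, eigenvectors
axes_demo = 2. * np.sqrt(2.) * np.sqrt(v_demo)       # ellipse width/height
u_demo = w_demo[0] / linalg.norm(w_demo[0])
angle_demo = 180. * np.arctan(u_demo[1] / u_demo[0]) / np.pi  # degrees
print(axes_demo, angle_demo)
# end illustration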
def train_doc2vec(train_corpus, vector_size=100, min_count=5, epochs=40):
    model = gensim.models.doc2vec.Doc2Vec(vector_size=vector_size,
                                          min_count=min_count, epochs=epochs)
    model.build_vocab(train_corpus)
    st_t = time.time()
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
    print('Time training %f' % (time.time() - st_t))
    return model
# 18 seconds for 8220 paragraphs and 13k words for the vocabulary, and 100 dimensions
# 127 seconds for 35995 paragraphs, corresponding only to german, and 100 dim
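# Illustration (not part of this commit): a quick smoke test of train_doc2vec
# on a toy corpus; the tokens below are invented, not repository data.
# min_count=1 keeps the tiny vocabulary from being pruned.
toy_corpus = [
    gensim.models.doc2vec.TaggedDocument(['rat', 'gesetz', 'artikel'], [0]),
    gensim.models.doc2vec.TaggedDocument(['abstimmung', 'ja', 'nein'], [1]),
]
toy_model = train_doc2vec(toy_corpus, vector_size=10, min_count=1, epochs=5)
print(toy_model.infer_vector(['gesetz', 'artikel']).shape)  # -> (10,)
# end illustration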
def feat_tsne(model):
    feat_mat_doc = np.zeros((model.docvecs.count, model.vector_size))
    for i_c in np.arange(model.docvecs.count):
        feat_mat_doc[i_c, :] = model.docvecs[i_c]
    st_t = time.time()
    feat_mat_doc_embed = TSNE(n_components=2).fit_transform(feat_mat_doc)
    print('Time %f' % (time.time() - st_t))
    return feat_mat_doc, feat_mat_doc_embed
# 1562 seconds for 35995 parag and 100 dim
def scatter_lang(feat_mat_doc_embed, lang_use, list_lang):
    plt.figure(figsize=(40, 20))
    if lang_use == 'all':
        ind_c = np.zeros(len(list_lang))
        for i_l, lang in enumerate(np.unique(np.array(list_lang))):
            ind_p = np.argwhere(np.array(list_lang) == lang)
            if len(ind_p):
                ind_c[ind_p] = i_l
                plt.scatter(feat_mat_doc_embed[ind_p, 0], feat_mat_doc_embed[ind_p, 1],
                            label=lang, alpha=0.6)
    else:
        plt.scatter(feat_mat_doc_embed[:, 0], feat_mat_doc_embed[:, 1], alpha=0.6)
    #plt.colorbar(sca)
    plt.box(False)
    plt.legend()
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
def fit_gmm_plot(feat_mat, n_comp=10, cov_t='full', flag_plot=0):
    dpgmm = mixture.BayesianGaussianMixture(n_components=n_comp,
                                            covariance_type=cov_t).fit(feat_mat)
    res_pred = np.array(dpgmm.predict(feat_mat))
    if flag_plot:
        plot_results(feat_mat, res_pred, dpgmm.means_, dpgmm.covariances_,
                     'Bayesian Gaussian Mixture with a Dirichlet process prior')
        plt.show()
    return dpgmm, res_pred
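# Illustration (not part of this commit): the variational Dirichlet-process
# prior in BayesianGaussianMixture drives the weights of unneeded components
# toward zero, so the fit typically uses fewer clusters than n_comp. A toy
# run on two invented, well-separated blobs; the _demo names are new here.
rng_demo = np.random.RandomState(0)
X_demo = np.vstack([rng_demo.normal(0, 1, (100, 2)),
                    rng_demo.normal(6, 1, (100, 2))])
dpgmm_demo, labels_demo = fit_gmm_plot(X_demo, n_comp=5, cov_t='full')
print(np.unique(labels_demo))        # usually only ~2 distinct labels
print(dpgmm_demo.weights_.round(2))  # redundant components get near-zero weight
# end illustration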
def scatter_classes(feat_mat_embed, res_pred):
    plt.figure(figsize=(40, 20))
    for i_l, clust in enumerate(np.unique(res_pred)):
        ind_p = np.argwhere(res_pred == clust)
        if len(ind_p):
            plt.scatter(feat_mat_embed[ind_p, 0], feat_mat_embed[ind_p, 1],
                        label=clust, alpha=0.6)
    plt.box(False)
    plt.legend()
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
def write_to_txt(text_par_dict, res_pred, str_ex='', folder_res='../../../'):
    # NOTE: the default argument above is immediately overridden here
    folder_res = '../../../'
    for clust in np.unique(res_pred):
        fp_o = open(folder_res + 'Text_in_Clust' + str(clust) + str_ex + '.txt', 'w+')
        ind_p = np.argwhere(res_pred == clust)
        for i_p in ind_p:
            fp_o.write(text_par_dict[int(i_p)] + '\n')
            fp_o.write('\n')
        fp_o.close()
def rem_clusters(cl_keep, train_corpus, list_lang, text_par_dict):
    # NOTE: reads res_pred from module scope (it is defined further below)
    train_corpus_red = list()
    list_lang_red = list()
    text_par_dict_red = dict()
    count = 0
    for i_c in range(len(train_corpus)):
        if res_pred[i_c] in cl_keep:
            aux_in = gensim.models.doc2vec.TaggedDocument(train_corpus[i_c][0], [count])
            train_corpus_red.append(aux_in)
            list_lang_red.append(list_lang[i_c])
            text_par_dict_red[count] = text_par_dict[int(i_c)]
            count += 1
    return train_corpus_red, list_lang_red, text_par_dict_red
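# Sketch (not part of this commit): because rem_clusters depends on the global
# res_pred, it is fragile outside this script. A self-contained variant would
# take the cluster labels explicitly; rem_clusters_explicit is a name invented
# here for illustration.
def rem_clusters_explicit(cl_keep, res_pred, train_corpus, list_lang, text_par_dict):
    # Same filtering as rem_clusters, with the labels passed in
    train_corpus_red, list_lang_red, text_par_dict_red = [], [], {}
    count = 0
    for i_c in range(len(train_corpus)):
        if res_pred[i_c] in cl_keep:
            train_corpus_red.append(
                gensim.models.doc2vec.TaggedDocument(train_corpus[i_c][0], [count]))
            list_lang_red.append(list_lang[i_c])
            text_par_dict_red[count] = text_par_dict[int(i_c)]
            count += 1
    return train_corpus_red, list_lang_red, text_par_dict_red
# end sketch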
#%%
# In this script I am using paragraph embeddings to classify the different
# sections in the documents. In principle, we should have the following: laws,
# votes and speeches. Though there may well exist many more, with more
# subtle differences, like amendments, etc.

folder_database = '../../data/AB/'
years = np.arange(1891, 1899)
name_meta = '01_rawmeta'
name_outcorrxml = '04_correctedxml'
#%%
# Generate initial corpus
train_corpus = list()
list_lang = list()
count_par = 0
lang_use = 'all'
flag_lemma = 0
text_par_dict = dict()
flag_byblock = 1

for year in years:
    print('Year: %d' % year)
    files, list_ids = ut_p.get_list(year, folder_database, name_outcorrxml)
    for i_file in range(len(files)):
        name_xml = files[i_file]
        name_xml_meta = name_xml.split('_')[0] + '.xml'
        path_meta_xml_file = ut_p.get_handlerfile(name_xml_meta, folder_database, name_meta)
        disc_flag = ut_a.check_if_discussion(path_meta_xml_file)
        if disc_flag:
            h_xml = ut_p.get_handlerfile(name_xml, folder_database, name_outcorrxml)
            XML_tree = ET.parse(h_xml)
            XML_main = XML_tree.getroot()
            not_end = 1
            i_p = 0; i_t = 0
            while not_end:
                # for i_p in range(len(XML_main)):
                #     for i_t in range(len(XML_main[i_p])):
                # We group by blocks
                text_par = ''
                if flag_byblock:
                    flag_block = 1
                    if XML_main[i_p][i_t].tag == 'textbox':
                        id_block = XML_main[i_p][i_t].attrib['block']
                    while flag_block:
                        if XML_main[i_p][i_t].tag == 'textbox':
                            if XML_main[i_p][i_t].attrib['block'] == id_block:
                                type_t = ut_a.get_textbox_type(XML_main[i_p][i_t])
                                if type_t in ('text_col1', 'text_col2'):
                                    text_par = text_par + ' ' + \
                                        ut_a.get_complete_text(XML_main[i_p][i_t])[0]
                            else:
                                flag_block = 0
                        else:
                            flag_block = 0
                        i_t += 1
                        if i_t == len(XML_main[i_p]):
                            i_p += 1; i_t = 0
                            if i_p == len(XML_main):
                                flag_block = 0; not_end = 0
                else:
                    if XML_main[i_p][i_t].tag == 'textbox':
                        type_t = ut_a.get_textbox_type(XML_main[i_p][i_t])
                        if type_t in ('text_col1', 'text_col2'):
                            text_par = ut_a.get_complete_text(XML_main[i_p][i_t])[0]
                    i_t += 1
                    if i_t == len(XML_main[i_p]):
                        i_p += 1; i_t = 0
                        if i_p == len(XML_main):
                            not_end = 0
                # Identify the language and add the paragraph to the corpus
                if len(text_par.strip(' ')):
                    dict_lang = ut_a.identify_language(text_par)
                    label_language = ut_a.label_language_simple(dict_lang)
                    list_lang.append(label_language)
                    if lang_use == 'all':
                        train_corpus.append(gensim.models.doc2vec.TaggedDocument(
                            gensim.utils.simple_preprocess(text_par), [count_par]))
                        text_par_dict[count_par] = text_par
                        count_par += 1
                    else:
                        if label_language == lang_use:
                            if flag_lemma:
                                name_k = 'doc' + str(count_par)
                                text_par_dict[name_k] = text_par
                                #text_par_d = {'doc1': text_par}
                                #preproc = TMPreproc(text_par_d, language = lang_use)
                                #st_t = time.time()
                                #preproc.tokenize().pos_tag().lemmatize()
                                #print('Time lemmatize %f' % (time.time() - st_t))
                                #train_corpus.append(gensim.models.doc2vec.TaggedDocument(list(preproc.tokens['doc1']), [count_par]))
                            else:
                                train_corpus.append(gensim.models.doc2vec.TaggedDocument(
                                    gensim.utils.simple_preprocess(text_par), [count_par]))
                            count_par += 1

if flag_lemma:
    preproc = TMPreproc(text_par_dict, language=lang_use)
    st_t = time.time()
    preproc.tokenize().pos_tag().lemmatize()
    print('Time lemmatize %f' % (time.time() - st_t))
    st_t = time.time()
    [train_corpus.append(gensim.models.doc2vec.TaggedDocument(
        preproc.tokens['doc' + str(i_c)], [i_c])) for i_c in range(len(text_par_dict))]
    print('Build train corpus %f' % (time.time() - st_t))
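# Illustration (not part of this commit): each corpus entry built above pairs
# a preprocessed token list with a single integer tag. A minimal example on an
# invented paragraph; the exact token list may vary with gensim's tokenizer.
doc_demo = gensim.models.doc2vec.TaggedDocument(
    gensim.utils.simple_preprocess('Der Nationalrat berät das Gesetz.'), [0])
print(doc_demo.words)  # roughly ['der', 'nationalrat', 'berät', 'das', 'gesetz']
print(doc_demo.tags)   # [0]
# end illustration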
#%%
# First iteration
model = train_doc2vec(train_corpus, vector_size=50, min_count=5)
feat_mat_doc, feat_mat_doc_embed = feat_tsne(model)
scatter_lang(feat_mat_doc_embed, lang_use, list_lang)
dpgmm, res_pred = fit_gmm_plot(feat_mat_doc, n_comp=10, cov_t='full', flag_plot=0)
scatter_classes(feat_mat_doc_embed, res_pred)

#%%
write_to_txt(text_par_dict, res_pred)
#%%
# Second iteration
train_corpus_red1, list_lang_red1, text_par_dict_red1 = rem_clusters(
    [4, 7], train_corpus, list_lang, text_par_dict)
model_red1 = train_doc2vec(train_corpus_red1, vector_size=100, min_count=5)
feat_mat_doc_red1, feat_mat_doc_embed_red1 = feat_tsne(model_red1)
scatter_lang(feat_mat_doc_embed_red1, lang_use, list_lang_red1)
dpgmm_red1, res_pred_red1 = fit_gmm_plot(feat_mat_doc_red1, n_comp=10, cov_t='diag',
                                         flag_plot=0)
scatter_classes(feat_mat_doc_embed_red1, res_pred_red1)

#%%
write_to_txt(text_par_dict_red1, res_pred_red1, str_ex='_Red1')
#%%
ind_f = np.argwhere(np.array(list_lang_red1) == 'french')

from googletrans import Translator
translator = Translator()
# <Translated src=ko dest=en text=Good evening. pronunciation=Good evening.>
translator.translate(text_par_dict_red1[int(ind_f[2])], dest='de')
# <Translated src=ko dest=ja text=こんにちは。 pronunciation=Kon'nichiwa.>
#%%
'''
# Set file names for train and test data
test_data_dir = '{}'.format(os.sep).join([gensim.__path__[0], 'test', 'test_data'])
lee_train_file = test_data_dir + os.sep + 'lee_background.cor'
lee_test_file = test_data_dir + os.sep + 'lee.cor'

#%%
# smart_open can be used with really long files in an optimal way, as it
# streams the data
def read_corpus(fname, tokens_only=False):
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])
# yield works like return, but for iterables: it runs the function up to the
# yield, returns the element, and resumes on the next request, until the for
# loop has been exhausted

train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

#%%
# Training
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(train_corpus)
st_t = time.time()
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
print('Time %f' % (time.time() - st_t))

#%%
# Get all word vectors, and plot embeddings
vocab = list(model.wv.vocab)
feat_mat = np.zeros((len(vocab), model.vector_size))
for i_c, word in enumerate(vocab):
    feat_mat[i_c, :] = model.wv.get_vector(word)
feat_mat_embed = TSNE(n_components=2).fit_transform(feat_mat)

#%%
n_words = 2000
plt.figure(figsize=(40, 20))
sca = plt.scatter(feat_mat_embed[:, 0], feat_mat_embed[:, 1])
ind_rand = np.random.permutation(len(vocab))
for i_w in ind_rand[:n_words]:
    plt.text(feat_mat_embed[i_w, 0], feat_mat_embed[i_w, 1], vocab[i_w])
#plt.colorbar(sca)
plt.box(False)
plt.xticks([])
plt.yticks([])
plt.tight_layout()

#%%
# Commands
# get similar words: model.wv.most_similar('war', topn = 10)

#%%
# Assessing the model
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    second_ranks.append(sims[1])

#%%
# Mapping of doc vecs
feat_mat_doc = np.zeros((model.docvecs.count, model.vector_size))
for i_c in np.arange(model.docvecs.count):
    feat_mat_doc[i_c, :] = model.docvecs[i_c]
feat_mat_doc_embed = TSNE(n_components=2).fit_transform(feat_mat_doc)
n_docs = 300
plt.figure(figsize=(40, 20))
sca = plt.scatter(feat_mat_doc_embed[:, 0], feat_mat_doc_embed[:, 1])
#plt.colorbar(sca)
plt.box(False)
plt.xticks([])
plt.yticks([])
plt.tight_layout()
'''
src/python/utils_annot.py @ 58d9762d

@@ -1023,6 +1023,36 @@ def label_language(XML_new, ind_p, ind_t, aux_dict_l):
     return XML_new
# Simply, given the number of occurrences of the stopwords, it assigns a label
# to a specific textbox, also considering the possibility of textboxes
# mixing languages. For this case, the value ratio_similar is intended
# input:
# - aux_dict_l: corresponds to dict_language_counts
# output:
# - lang_max: string
def label_language_simple(aux_dict_l):
    # specify a similarity ratio
    ratio_similar = 0.8
    # if there are counts, determine language
    if sum(aux_dict_l.values()):
        aux_dict_l_norm = {k: v / total for total in (sum(aux_dict_l.values()),)
                           for k, v in aux_dict_l.items()}
        lang_max_aux = max(aux_dict_l_norm.keys(), key=(lambda key: aux_dict_l_norm[key]))
        lang_max = ''
        count_l = 0
        for lang in aux_dict_l_norm.keys():
            if (aux_dict_l_norm[lang] > aux_dict_l_norm[lang_max_aux] * ratio_similar):
                if count_l > 0:
                    lang_max += '_'
                lang_max += lang
                count_l += 1
        if count_l > 1:
            lang_max = 'mixed_' + lang_max
    else:
        lang_max = 'languageNotIdentified'
    return lang_max
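# Illustration (not part of this commit): behaviour of label_language_simple
# on invented stopword counts; the dicts below mimic dict_language_counts.
print(label_language_simple({'german': 120, 'french': 3, 'italian': 1}))
# -> 'german'
print(label_language_simple({'german': 50, 'french': 45, 'italian': 0}))
# -> 'mixed_german_french' (both shares exceed 0.8 * the maximum share)
print(label_language_simple({'german': 0, 'french': 0, 'italian': 0}))
# -> 'languageNotIdentified'
# end illustration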
def get_cities(list_citizenship):
    return [city[:-5] for item in list_citizenship for city in item.split(',')]
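# Illustration (not part of this commit): get_cities appears to assume every
# comma-separated entry ends in a fixed-width 5-character suffix (e.g. a
# canton code such as ' (BE)'); a toy call under that assumption:
print(get_cities(['Bern (BE),Zürich (ZH)']))  # -> ['Bern', 'Zürich']
# end illustration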