Skip to content
Snippets Groups Projects
Commit 82ea87c0 authored by Lili Gasser's avatar Lili Gasser
Browse files

mend

parent f279c6bd
No related branches found
No related tags found
2 merge requests: !8 "Extract discussions", !7 "extract discussions"
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# NOTE(review): this span is a scraped *diff view* of the script, not the script
# itself — lines beginning with "......@@" are rendered hunk headers, and several
# statements appear twice (old version followed by new version). It is preserved
# byte-identically below; it will not run as-is.
#%%
# IPython-only magics (auto-reload imported modules on change); invalid in plain Python.
%load_ext autoreload
%autoreload 2
import os
import pickle
......@@ -14,6 +17,7 @@ sys.path.append('src/python/')
# Project-local modules (not stdlib): document model, processing and annotation helpers.
import def_classes as defc
import utils_proc
import utils_annot
#%%
# Hard-coded defaults for interactive runs in the editor (Atom);
# overwritten by the sys.argv block below when run as a script.
# needed for running in atom, can be ignored
input_lastnames = "data/politicians/lastnames/1891_lastnames.pickle"
......@@ -22,19 +26,22 @@ input_correctedmeta = "data/AB/1891/03_correctedmeta.tar.gz"
output_annotatedxml = "data/AB/1891/05_annotatedxml.tar.gz"
# detect arguments
# NOTE(review): diff artifact — the active sys.argv assignments (new side) and the
# commented-out copies (old side) are both shown here.
input_lastnames = sys.argv[1]
input_correctedxml = sys.argv[2]
input_correctedmeta = sys.argv[3]
output_annotatedxml = sys.argv[4]
#input_lastnames = sys.argv[1]
#input_correctedxml = sys.argv[2]
#input_correctedmeta = sys.argv[3]
#output_annotatedxml = sys.argv[4]
# extract suffix, year, folder_database
#%%
# extract suffixes, year, folder_database
# Derive tarball basenames (without ".tar.gz") from the input paths.
suffix_tar_correctedxml = input_correctedxml.split('/')[-1].split('.tar.gz')[0]
suffix_tar_correctedmeta = input_correctedmeta.split('/')[-1].split('.tar.gz')[0]
# NOTE(review): the next two lines are the old and new version of the same
# assignment (diff artifact); "year" is the second-to-last path component,
# e.g. "1891" from "data/AB/1891/04_correctedxml.tar.gz".
year = (input_correctedxml.split('/')[-2])
year = input_correctedxml.split('/')[-2]
# Everything before the year component, e.g. "data/AB/".
folder_database = input_correctedxml.split(year)[0]
suffix_correctedmeta = '_metacorr'
suffix_correctedxml = '_datacorr'
#%%
# print some output
print(year)
print(type(year))
......@@ -42,7 +49,7 @@ print(input_correctedxml)
print(input_lastnames)
print(input_correctedmeta)
print(folder_database)
#%%
# pull necessary data
# TODO pull necessary data from here
#!!git lfs pull origin -I input_correctedxml
......@@ -54,7 +61,7 @@ print(folder_database)
# get dictionary of discussions
# -----------------------------
#%%
# Timing start for the discussion-identification phase ("time" imported outside this view).
start_time_discussions = time.time()
print('start to identify discussions of the year', year, '\n\n\n')
......@@ -71,8 +78,7 @@ list_stopwords.extend(['dass', 'resp', 'daran', 'dr', 'herr', 'herrn', 'hr'])
# Add French NLTK stopwords plus a few manual extras (German extras are in the hunk above).
list_stopwords.extend(stopwords.words('french'))
list_stopwords.extend(['ils', 'les', 'celle'])
# string for this year
# NOTE(review): removed on the new side of the diff — "year" is already used
# directly below instead of "str_year".
str_year = str(year)
#%%
# initialize empty dictionary for that year
dict_year = {}
......@@ -82,6 +88,7 @@ files_to_process.sort()
print(files_to_process)
#meta_files, _ = utils_proc.get_list(year, folder_database, suffix_tar_correctedmeta)
#%%
# open dataframe of last names from pickle file
# df_lastnames is assigned inside the hunk hidden by the header below.
with open(input_lastnames, 'rb') as f:
......@@ -89,25 +96,48 @@ with open(input_lastnames, 'rb') as f:
print(df_lastnames)
df_lastnames.columns
#%%
# Interactive single-file debug cell (added on the new side of the diff):
# processes one hard-coded 1891 file the same way the loop below does.
file_tarpath = './1891/20026440_datacorr.xml'
# First 8 characters of the basename are the document number.
file_number = file_tarpath.split('/')[-1][:8]
metafile_tarpath = './{}/{}{}.xml'.format(year, file_number, suffix_correctedmeta)
# instantiate document object (always from original pdf)
infile_aux = year + '/' + file_number + '.pdf'
file_doc = defc.Document(infile_aux, folder_database)
file_doc
# Skip three known-problematic document numbers; only proceed for discussion documents.
if (file_doc.check_discussion()) and (file_number not in ['20032463', '20032952', '20014332']):
print(file_number + '\n')
#%%
# for each file
# NOTE(review): the loop body below has lost its indentation in this scrape and
# again interleaves old lines (str_year / infile_aux) with new ones (year / file_aux).
for file_tarpath in files_to_process:
# print(file_tarpath)
file_number = file_tarpath.split('/')[-1][:8]
metafile_tarpath = './{}/{}{}.xml'.format(str_year, file_number, suffix_correctedmeta)
# instantiate document object (always from original pdf)
infile_aux = str_year + '/' + file_number + '.pdf'
file_doc = defc.Document(infile_aux, folder_database)
file_aux = year + '/' + file_number + '.pdf'
file_doc = defc.Document(file_aux, folder_database)
# ----> CONTINUE HERE NEXT TIME!!!!!!
#path_number = path_data + str_year + '/' + number + '/'
# if document is a discussion
if (file_doc.check_discussion()) and (file_number not in ['20032463', '20032952', '20014332']):
print(file_number + '\n')
#%%
## get dictionary with text
#dict_text = hf.get_text_onefile(path_number + number + '_datacorr.xml')
## exclude parts from previous and next document
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment