#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Code to extract discussions from corrected XML files.
#
# The script is organised in `#%%` cells so that it can be stepped through
# interactively; it also takes its six file paths from the command line.

#%%
# to work with atom
# (IPython magics: they are not valid syntax for the plain interpreter,
# so they stay commented out here)
# %load_ext autoreload
# %autoreload 2

import os
import pickle
import sys
import time
import xml.etree.ElementTree as ET

# the project modules are looked up in src/python/, so the search path has
# to be extended before they are imported
sys.path.append('src/python/')

import def_classes as defc
import utils_proc
import utils_annot
from utils_proc import call_with_out

#%%
# specify input and output files
# needed for running in atom, can be ignored
year = '1936'
folder_year = "data/AB/" + year + "/"

input_lastnames = "data/politicians/lastnames/" + year + "_MPs.pickle"
input_correctedxml = folder_year + "04_correctedxml.tar.gz"
input_correctedmeta = folder_year + "03_correctedmeta.tar.gz"
input_notnames = "data/lists/not_names.txt"
input_overlaps = "data/lists/dict_overlaps.pickle"
output_annotatedxml = folder_year + "05_annotatedxml.tar.gz"

#%%
# detect arguments
# (the command line values replace the defaults set in the previous cell)
input_lastnames = sys.argv[1]
input_correctedxml = sys.argv[2]
input_correctedmeta = sys.argv[3]
input_notnames = sys.argv[4]
input_overlaps = sys.argv[5]
output_annotatedxml = sys.argv[6]

#%%
# extract suffixes, year, folder_database
# e.g. "data/AB/1936/04_correctedxml.tar.gz" gives
#   suffix_tar_correctedxml = "04_correctedxml"
#   year                    = "1936"
#   folder_database         = "data/AB/"
path_parts_correctedxml = input_correctedxml.split('/')
suffix_tar_correctedxml = path_parts_correctedxml[-1].split('.tar.gz')[0]
#suffix_tar_correctedmeta = input_correctedmeta.split('/')[-1].split('.tar.gz')[0]
year = path_parts_correctedxml[-2]
# everything in front of the first occurrence of the year in the path
folder_database = input_correctedxml.partition(year)[0]
suffix_correctedmeta = '_metacorr'
#suffix_correctedxml = '_datacorr'

#%%
# TODO: make it work!
# git lfs pull necessary data
# (one pull per input archive, restricted to that file with -I)
for lfsfile in (input_correctedxml, input_correctedmeta):
    command = 'git lfs pull -I {}'.format(lfsfile)
    #print(command)
    call_with_out(command)

#%%
# TODO: exclude overlaps --> after annotation

#%%
start_time_discussions = time.time()
print('start to identify discussions of the year', year, '\n')

# extract list of files; the second return value is not needed here
files_to_process, _ = utils_proc.get_list(year, folder_database, suffix_tar_correctedxml)
files_to_process.sort()
print('files to process loaded:', files_to_process)

# open dataframe of last names from pickle file
# (there is one file of lastnames per year)
# missing values are replaced by empty strings right after loading
with open(input_lastnames, 'rb') as f:
    df_lastnames = pickle.load(f).fillna('')
print('dataframe with lastnames loaded')

# open list of terms that are easily mistaken as names
# (one term per line, trailing whitespace and line breaks removed)
with open(input_notnames) as f:
    list_notnames = [term.rstrip() for term in f]
print('list of notnames loaded')

# open dictionary of overlaps from pickle file
# (keyed by the year as an integer; only the current year is kept)
with open(input_overlaps, 'rb') as f:
    dict_overlaps = pickle.load(f)
dict_overlaps_year = dict_overlaps[int(year)]
print('dictionary of overlaps loaded')

#%%
# documents that are never annotated, whatever check_discussion() says
ids_excluded = ['20032463', '20032952', '20014332']

# for each file
for file_tarpath in files_to_process:
    #print(file_tarpath)
    # the document id is the first 8 characters of the file name
    id_doc = file_tarpath.split('/')[-1][:8]

    # instantiate document object (always from original pdf)
    file_aux = '{}/{}.pdf'.format(year, id_doc)
    file_doc = defc.Document(file_aux, folder_database)

    # only documents that are a discussion are annotated
    if not file_doc.check_discussion():
        continue
    if id_doc in ids_excluded:
        continue

    print(id_doc + '\n')

    # hand the lookup data to the document, then annotate it
    file_doc.df_lastnames = df_lastnames
    file_doc.list_notnames = list_notnames
    file_doc.dict_overlaps_year = dict_overlaps_year
    file_doc.annotate_xml()

# Commands to get the compressed version of the file
# (compressed file is around 5 times smaller than uncompressed file)
#data/AB/${year}/05_annotatedxml.tar.gz
# NOTE(review): placed after the loop, i.e. run once per year - confirm
# against the original layout of the file
utils_proc.compress_tar(output_annotatedxml)

#%%
# re-read the raw lines of the notnames file (line breaks still attached)
with open(input_notnames) as f:
    list_notnames = list(f)
import datetime

# strip trailing whitespace and line breaks from every term
list_notnames = [entry.rstrip() for entry in list_notnames]

# to test for one file
# NOTE(review): this cell is not guarded, so it also runs when the file is
# executed as a script - confirm that this is intended
file_tarpath = './1936/20031998_datacorr.xml'
# the document id is the first 8 characters of the file name
id_doc = file_tarpath.split('/')[-1][:8]

# instantiate document object (always from original pdf)
infile_aux = '{}/{}.pdf'.format(year, id_doc)
file_doc = defc.Document(infile_aux, folder_database)

# same condition as in the loop over all files: the document has to be a
# discussion and must not be one of the excluded ids
is_discussion = file_doc.check_discussion()
if is_discussion and id_doc not in ['20032463', '20032952', '20014332']:
    print(id_doc + '\n')

    # hand the lookup data to the document, then annotate it
    file_doc.df_lastnames = df_lastnames
    file_doc.list_notnames = list_notnames
    file_doc.dict_overlaps_year = dict_overlaps_year
    file_doc.annotate_xml()

#%%
# scratch cell: the bare expressions below only show a value when they are
# evaluated interactively
file_doc = defc.Document(infile_aux, folder_database)
file_doc.get_council_date()

#id_doc
#len(files_to_process)

file_doc.check_discussion()

# parse a date string of the form "YYYY-MM-DD HH:MM"
str_date = '1925-12-09 08:00'
datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M')