#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Code to extract discussions from corrected XML files
#%%
# to work with atom: reload edited modules automatically.
# The magics are issued through the IPython shell API because the "%magic"
# line syntax is a SyntaxError when the file is run with plain python
# (which is how the command line arguments read below are passed in).
try:
    _ipython_shell = get_ipython()
except NameError:
    # not running inside IPython / a Hydrogen kernel: nothing to reload
    _ipython_shell = None

if _ipython_shell is not None:
    _ipython_shell.run_line_magic('load_ext', 'autoreload')
    _ipython_shell.run_line_magic('autoreload', '2')

import pickle
import time
import xml.etree.ElementTree as ET

import sys
sys.path.append('src/python/')

import def_classes as defc
import utils_proc
import utils_annot

import os

from utils_proc import call_with_out

#%%
# specify input and output files

# default values for running the cells in atom (year 1936);
# can be ignored when the paths are given as arguments
year = '1936'

# all per-year archives live in the same folder
folder_year = "data/AB/" + year

# inputs that do not depend on the year
input_notnames = "data/lists/not_names.txt"
input_overlaps = "data/lists/dict_overlaps.pickle"

# inputs and output that depend on the year
input_lastnames = "data/politicians/lastnames/" + year + "_MPs.pickle"
input_correctedxml = folder_year + "/04_correctedxml.tar.gz"
input_correctedmeta = folder_year + "/03_correctedmeta.tar.gz"
output_annotatedxml = folder_year + "/05_annotatedxml.tar.gz"

#%%
# detect arguments
# Expected positional arguments, in this order:
#   input_lastnames input_correctedxml input_correctedmeta
#   input_notnames input_overlaps output_annotatedxml
# The count is checked before anything is assigned, so that a short argument
# list (e.g. the arguments of an IPython kernel) neither overwrites part of
# the paths nor ends in a bare IndexError.
if len(sys.argv) < 7:
    sys.exit('usage: <script> input_lastnames input_correctedxml '
             'input_correctedmeta input_notnames input_overlaps '
             'output_annotatedxml')

# additional arguments after the sixth one are ignored
(input_lastnames, input_correctedxml, input_correctedmeta,
 input_notnames, input_overlaps, output_annotatedxml) = sys.argv[1:7]

#%%
# extract suffixes, year, folder_database
# input_correctedxml is expected to look like
#   <folder_database><year>/<suffix_tar_correctedxml>.tar.gz
path_parts_correctedxml = input_correctedxml.split('/')
file_tar_correctedxml = path_parts_correctedxml[-1]
suffix_tar_correctedxml = file_tar_correctedxml.split('.tar.gz')[0]
#suffix_tar_correctedmeta = input_correctedmeta.split('/')[-1].split('.tar.gz')[0]
year = path_parts_correctedxml[-2]
# folder_database is everything in front of the trailing "<year>/<file>".
# Cutting that tail off (instead of splitting at the first occurrence of the
# year string) keeps the result correct when the year also shows up in an
# earlier part of the path.
folder_database = input_correctedxml[:-len(year + '/' + file_tar_correctedxml)]
suffix_correctedmeta = '_metacorr'
#suffix_correctedxml = '_datacorr'

#%%
# TODO: make it work!
# git lfs pull necessary data: one pull per input archive
for archive_path in (input_correctedxml, input_correctedmeta):
    call_with_out('git lfs pull -I ' + archive_path)

#%%
# TODO: exclude overlaps --> after annotation


#%%
# remember when the run started and announce the year that is processed
start_time_discussions = time.time()
print('start to identify discussions of the year', year, '\n')

# extract list of files and bring it into sorted order
files_to_process, _ = utils_proc.get_list(year, folder_database, suffix_tar_correctedxml)
files_to_process.sort()
print('files to process loaded:', files_to_process)

# dataframe of last names, read from the pickle file
# (there is one file of lastnames per year); missing values are replaced by ''
with open(input_lastnames, 'rb') as fh_lastnames:
    df_lastnames = pickle.load(fh_lastnames).fillna('')
print('dataframe with lastnames loaded')

# terms that are easily mistaken as names: one term per line,
# trailing whitespace (including the newline) is removed
with open(input_notnames) as fh_notnames:
    list_notnames = [line.rstrip() for line in fh_notnames]
print('list of notnames loaded')

# dictionary of overlaps, read from the pickle file;
# the entry of the processed year is looked up with the year as integer
with open(input_overlaps, 'rb') as fh_overlaps:
    dict_overlaps = pickle.load(fh_overlaps)
dict_overlaps_year = dict_overlaps[int(year)]
print('dictionary of overlaps loaded')


#%%
# ids of documents that are never annotated, even if they are discussions
ids_excluded = ['20032463', '20032952', '20014332']

# for each file
for file_tarpath in files_to_process:
    # the document id is given by the first 8 characters of the file name
    id_doc = file_tarpath.rsplit('/', 1)[-1][:8]

    # instantiate document object (always from original pdf)
    file_aux = '/'.join([year, id_doc + '.pdf'])
    file_doc = defc.Document(file_aux, folder_database)

    # skip everything that is not a discussion or that is excluded
    if not file_doc.check_discussion() or id_doc in ids_excluded:
        continue

    print(id_doc + '\n')

    # hand the lookup data to the document, then annotate it
    file_doc.df_lastnames = df_lastnames
    file_doc.list_notnames = list_notnames
    file_doc.dict_overlaps_year = dict_overlaps_year
    file_doc.annotate_xml()

# Command to get the compressed version of the file
# (compressed file is around 5 times smaller than uncompressed file),
# e.g. data/AB/${year}/05_annotatedxml.tar.gz
utils_proc.compress_tar(output_annotatedxml)




#%%
# Everything from here on is interactive scratch work that tests a single,
# hard-coded document. When the file is executed as a plain script (no
# IPython shell available), stop here: otherwise this code would run after
# compress_tar has been called and would combine the hard-coded 1936 document
# id with whatever year was derived from the arguments.
try:
    get_ipython()
except NameError:
    sys.exit(0)

# reload the list of terms that are easily mistaken as names
with open(input_notnames) as fh_notnames:
    list_notnames = [line.rstrip() for line in fh_notnames]

# to test for one file
file_tarpath = './1936/20031998_datacorr.xml'

# the document id is given by the first 8 characters of the file name
id_doc = file_tarpath.split('/')[-1][:8]

# instantiate document object (always from original pdf)
infile_aux = year + '/' + id_doc + '.pdf'
file_doc = defc.Document(infile_aux, folder_database)

# annotate only if the document is a discussion and is not excluded
if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
    print(id_doc + '\n')

    file_doc.df_lastnames = df_lastnames
    file_doc.list_notnames = list_notnames
    file_doc.dict_overlaps_year = dict_overlaps_year
    file_doc.annotate_xml()


#%%
# Scratch cell for inspecting a single document interactively.
# It relies on infile_aux and folder_database being defined already.
# None of the call results below are stored in a variable.

# fresh document object for the same pdf
file_doc = defc.Document(infile_aux, folder_database)
file_doc.get_council_date()
#id_doc

#len(files_to_process)
file_doc.check_discussion()

# try out parsing a date string of the form 'YYYY-MM-DD HH:MM'
str_date = '1925-12-09 08:00'
import datetime
datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M')