diff --git a/.gitattributes b/.gitattributes index 7030f5110e7a8b02a8dabeae2e4254548fcd3bca..5897e0e35513b77240dc873fa6760c20d206a130 100644 --- a/.gitattributes +++ b/.gitattributes @@ -319,3 +319,6 @@ data/AB/1947/05_annotatedxml.tar.gz filter=lfs diff=lfs merge=lfs -text data/AB/1948/05_annotatedxml.tar.gz filter=lfs diff=lfs merge=lfs -text data/AB/1949/05_annotatedxml.tar.gz filter=lfs diff=lfs merge=lfs -text data/AB/1950/05_annotatedxml.tar.gz filter=lfs diff=lfs merge=lfs -text +data/AB/1976/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text +data/AB/1977/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text +data/AB/1975/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9a187c9b9de280eadceda624e53b087796a9bed4..ac6bfbfb7f6f29aa500e4355baf8a4f94083ffad 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -24,8 +24,9 @@ image_build: dot: stage: build - image: renku/singleuser:latest + image: renku/renku-python:latest script: + - renku --version - renku log --format dot $(git ls-files --no-empty-directory --recurse-submodules) > graph.dot artifacts: paths: diff --git a/.renku/workflow/5864b37f16a646958e484a578e9ea288_python.cwl b/.renku/workflow/5864b37f16a646958e484a578e9ea288_python.cwl new file mode 100644 index 0000000000000000000000000000000000000000..5bd8208aa235401043c857e852b932b3404e103a --- /dev/null +++ b/.renku/workflow/5864b37f16a646958e484a578e9ea288_python.cwl @@ -0,0 +1,51 @@ +arguments: [] +baseCommand: +- python +class: CommandLineTool +cwlVersion: v1.0 +hints: [] +inputs: + input_1: + default: + class: File + path: ../../src/python/run_correctxml.py + inputBinding: + position: 1 + separate: true + shellQuote: true + streamable: false + type: File + input_2: + default: + class: File + path: ../../data/AB/1976/02_extractedxml.tar.gz + inputBinding: + position: 2 + separate: true + shellQuote: true + streamable: false + type: File + input_3: + default: data/AB/1976/04_correctedxml.tar.gz + inputBinding: + position: 3 + separate: true + shellQuote: true + streamable: false + type: string +outputs: + output_0: + outputBinding: + glob: $(inputs.input_3) + streamable: false + type: File +permanentFailCodes: [] +requirements: +- class: InlineJavascriptRequirement +- class: InitialWorkDirRequirement + listing: + - entry: '$({"listing": [], "class": "Directory"})' + entryname: data/AB/1976 + writable: true +successCodes: [] +temporaryFailCodes: [] diff --git a/.renku/workflow/6530d4b63e1045cb9b5af1e1d728cb44_python.cwl b/.renku/workflow/6530d4b63e1045cb9b5af1e1d728cb44_python.cwl new file mode 100644 index 0000000000000000000000000000000000000000..0b3192edf52a462c7db29c514f196da16ef6576f --- /dev/null +++ b/.renku/workflow/6530d4b63e1045cb9b5af1e1d728cb44_python.cwl @@ -0,0 +1,51 @@ +arguments: [] +baseCommand: +- python +class: CommandLineTool +cwlVersion: v1.0 +hints: [] +inputs: + input_1: + default: + class: File + path: ../../src/python/run_correctxml.py + inputBinding: + position: 1 + separate: true + shellQuote: true + streamable: false + type: File + input_2: + default: + class: File + path: ../../data/AB/1977/02_extractedxml.tar.gz + inputBinding: + position: 2 + separate: true + shellQuote: true + streamable: false + type: File + input_3: + default: data/AB/1977/04_correctedxml.tar.gz + inputBinding: + position: 3 + separate: true + shellQuote: true + streamable: false + type: string +outputs: + output_0: + outputBinding: + glob: $(inputs.input_3) + streamable: false + type: File +permanentFailCodes: [] +requirements: +- class: InlineJavascriptRequirement +- class: InitialWorkDirRequirement + listing: + - entry: '$({"listing": [], "class": "Directory"})' + entryname: data/AB/1977 + writable: true +successCodes: [] +temporaryFailCodes: [] diff --git a/Dockerfile b/Dockerfile index eccc0209bce183e39ef458c2b5f2af992f65b8f5..e51294d61b2799d50d4b7b46b7c659fa8e3edf17 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,4 +16,7 @@ RUN sudo apt-get install -y vim # install spacy models RUN python -m spacy download de_core_news_sm RUN python -m spacy download fr_core_news_sm -RUN python -m spacy download xx_ent_wiki_sm \ No newline at end of file +RUN python -m spacy download xx_ent_wiki_sm + +# install nltk stopwords and punkt +RUN python -m nltk.downloader punkt && python -m nltk.downloader stopwords \ No newline at end of file diff --git a/data/AB/1977/04_correctedxml.tar.gz b/data/AB/1977/04_correctedxml.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..b5af7fecd87275edc8be840fd1dc4c2d6317ccd6 --- /dev/null +++ b/data/AB/1977/04_correctedxml.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5097c7c4a8b40861290f1bcb828d93163c0ac3a9b6955c257936a6540d987c48 +size 22497388 diff --git a/src/python/run_correctxml.py b/src/python/run_correctxml.py index ff0710cb6d532daa9e60fb5a5d9e5c25923e7507..254909fbd8462363e598c27b0e68c77adeb7c123 100644 --- a/src/python/run_correctxml.py +++ b/src/python/run_correctxml.py @@ -34,6 +34,13 @@ folder_database = input_file.split(year_tocomp)[0] t1 = time.time() name_tar_file = input_file.split('/')[-1].split('.tar.gz')[0] +print(input_file) +comm = 'git lfs pull -I ' + input_file +utils_proc.call_with_out(comm) +input_file_pdf = '/'.join(input_file.split('/')[:-1]) + '/00_rawpdfs.tar.gz' +comm = 'git lfs pull -I ' + input_file_pdf +utils_proc.call_with_out(comm) + files_proc, _ = utils_proc.get_list(year_tocomp, folder_database, name_tar_file) list_proc = list() @@ -44,12 +51,12 @@ for infile in files_proc: if infile_aux not in list_proc: list_proc.append(infile_aux) d1 = defc.Document(infile_aux, folder_database) - try : - d1.correct_xml(flag_plots = 0, flag_parallel = 0, flag_save_figs = 0, name_outxml = name_tar_file, - name_outcorrxml = name_tar_out) + #try : + d1.correct_xml(flag_plots = 0, flag_parallel = 0, flag_save_figs = 0, name_outxml = name_tar_file, + name_outcorrxml = name_tar_out) #print('Corrected %s' % infile) - except: - print("File to correct %s prompted an error" % infile) + #except: + # print("File to correct %s prompted an error" % infile) # Commands to get the compressed version of the file #data/AB/${year}/02_extractedxml.tar.gz diff --git a/src/sh/execute_per_year_isolation.sh b/src/sh/execute_per_year_isolation.sh index cd3f676b994f05672484e300931c3c452317003a..1533a900599c21ed0e51a87c1e066033d61ed04f 100755 --- a/src/sh/execute_per_year_isolation.sh +++ b/src/sh/execute_per_year_isolation.sh @@ -12,5 +12,6 @@ for year in $(seq $year_start $year_end) do echo $year $CONDA_DIR/envs/renku/bin/renku run --isolation python $1 ${2}/$year/${3}.tar.gz ${2}/$year/${4}.tar.gz + git push done