From 9353f369a696a9997271738f935fd13431e807c4 Mon Sep 17 00:00:00 2001 From: Giulio Date: Sat, 14 Sep 2024 10:59:44 -0300 Subject: [PATCH] Atualiza Apache Tika e corrige build corrompido MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Versões mais novas do Tika exigem o "Accept: text/plain" para retornar apenas o conteúdo textual, pois o padrão é retornar HTML. --- data_extraction/text_extraction.py | 5 ++++- scripts/Dockerfile_apache_tika | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/data_extraction/text_extraction.py b/data_extraction/text_extraction.py index 8595aaf..723ca24 100644 --- a/data_extraction/text_extraction.py +++ b/data_extraction/text_extraction.py @@ -25,7 +25,10 @@ def _try_extract_text(self, filepath: str) -> str: if self.is_txt(filepath): return self._return_file_content(filepath) with open(filepath, "rb") as file: - headers = {"Content-Type": self._get_file_type(filepath)} + headers = { + "Content-Type": self._get_file_type(filepath), + "Accept": "text/plain", + } response = requests.put(f"{self._url}/tika", data=file, headers=headers) response.encoding = "UTF-8" return response.text diff --git a/scripts/Dockerfile_apache_tika b/scripts/Dockerfile_apache_tika index 150e67a..3ba2413 100644 --- a/scripts/Dockerfile_apache_tika +++ b/scripts/Dockerfile_apache_tika @@ -6,7 +6,7 @@ RUN adduser --system gazette && \ apt-get clean # install Apache Tika -RUN curl -o /tika-server.jar http://archive.apache.org/dist/tika/tika-server-1.24.1.jar && \ +RUN curl -o /tika-server.jar https://archive.apache.org/dist/tika/2.9.2/tika-server-standard-2.9.2.jar && \ chmod 755 /tika-server.jar USER gazette