From 578e98cd5e6fd91239d7e14457edd6a493f2443c Mon Sep 17 00:00:00 2001 From: tricktx Date: Mon, 7 Oct 2024 11:24:24 -0300 Subject: [PATCH] add headers in check if url is valid --- pipelines/utils/crawler_camara_dados_abertos/constants.py | 4 ++++ pipelines/utils/crawler_camara_dados_abertos/tasks.py | 8 +++++--- pipelines/utils/crawler_camara_dados_abertos/utils.py | 8 +++----- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pipelines/utils/crawler_camara_dados_abertos/constants.py b/pipelines/utils/crawler_camara_dados_abertos/constants.py index 7c4fa79a5..3383c1760 100755 --- a/pipelines/utils/crawler_camara_dados_abertos/constants.py +++ b/pipelines/utils/crawler_camara_dados_abertos/constants.py @@ -14,6 +14,10 @@ class constants(Enum): ANO_ATUAL = (datetime.now()).year ANO_ANTERIOR = (ANO_ATUAL - 1) + HEADERS = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" + } + TABLES_INPUT_PATH = { # ! - > Proposição "proposicao_microdados": f"/tmp/input/proposicoes-{ANO_ATUAL}.csv", diff --git a/pipelines/utils/crawler_camara_dados_abertos/tasks.py b/pipelines/utils/crawler_camara_dados_abertos/tasks.py index 76d00e79b..7c51ccd87 100755 --- a/pipelines/utils/crawler_camara_dados_abertos/tasks.py +++ b/pipelines/utils/crawler_camara_dados_abertos/tasks.py @@ -24,7 +24,7 @@ def save_data(table_id: str) -> str: df = download_and_read_data(table_id) if not os.path.exists(f'{constants_camara.OUTPUT_PATH.value}{table_id}'): os.makedirs(f'{constants_camara.OUTPUT_PATH.value}{table_id}') - log(f'testando : {constants_camara.OUTPUT_PATH.value}{table_id}') + output_path = constants_camara.TABLES_OUTPUT_PATH.value[table_id] if table_id == "proposicao_microdados": @@ -74,11 +74,13 @@ def save_data(table_id: str) -> str: retry_delay=timedelta(seconds=constants.TASK_RETRY_DELAY.value), ) def check_if_url_is_valid(table_id:str) -> bool: - if requests.get(constants_camara.TABLES_URL.value[table_id]).status_code == 200: + if requests.get(constants_camara.TABLES_URL.value[table_id], headers=constants_camara.HEADERS.value).status_code == 200: log("URL is valid") + log(constants_camara.TABLES_URL.value[table_id]) return True - elif requests.get(constants_camara.TABLES_URL_ANO_ANTERIOR.value[table_id]).status_code == 200: + elif requests.get(constants_camara.TABLES_URL_ANO_ANTERIOR.value[table_id], headers=constants_camara.HEADERS.value).status_code == 200: log("Table is not available in the current year only in the previous year") + log(constants_camara.TABLES_URL_ANO_ANTERIOR.value[table_id]) return False else: raise ValueError("URL is not valid") diff --git a/pipelines/utils/crawler_camara_dados_abertos/utils.py b/pipelines/utils/crawler_camara_dados_abertos/utils.py index 6630f701e..3285f3467 100755 --- a/pipelines/utils/crawler_camara_dados_abertos/utils.py +++ b/pipelines/utils/crawler_camara_dados_abertos/utils.py @@ -11,6 +11,7 @@ # ----------------------------------------------------------------------------------- > Universal def download_table_despesa(table_id:str) -> None: + http_response = urlopen(constants_camara.TABLES_URL.value[table_id]) zipfile = ZipFile(BytesIO(http_response.read())) zipfile.extractall(path=constants_camara.INPUT_PATH.value) @@ -34,13 +35,10 @@ def download_all_table(table_id: str) -> None: url = constants_camara.TABLES_URL.value[table_id] input_path = constants_camara.TABLES_INPUT_PATH.value[table_id] - headers = { - "Content-Type": "application/json;charset=UTF-8", - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)" - } + log(f"Downloading {table_id} from {url}") - response = requests.get(url, headers=headers) + response = requests.get(url, headers=constants_camara.HEADERS.value) if response.status_code == 200: with open(input_path, "wb") as f: f.write(response.content)