From 472db8b6d624c5afb4f61a2bb65d8846c245c60a Mon Sep 17 00:00:00 2001 From: Yair Mendoza Date: Wed, 4 Nov 2020 13:54:20 -0600 Subject: [PATCH 1/2] adding funtion for get average state prices from gasolinamx website --- etl/main.py | 46 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/etl/main.py b/etl/main.py index 457904e..a843e77 100644 --- a/etl/main.py +++ b/etl/main.py @@ -6,6 +6,7 @@ import os import sys +import time import random import requests import traceback @@ -14,6 +15,7 @@ import geopandas as gpd import sqlalchemy as db import xmltodict as x2d +import lxml.html as html from pathlib import Path from datetime import timedelta, datetime from timeit import default_timer as timer @@ -36,6 +38,40 @@ N_STATES = 3 N_ROWS = 2250 +def get_state_prices(): + """ + Attempts to retrieve a dictionary with the states of Mexico and it prices for + gasoline magna, premium and diesel with the following format: + {'name_state':{'magna':18.80, 'premium':19.76, 'diesel':20.98}'name_state_2'...} + website (deleting first any previously existing file) + """ + url = 'http://www.gasolinamx.com/tabla-del-precio-de-la-gasolina-en-mexico' + XPATH_STATES = '//table[@class="table table-bordered table-striped"]/tbody/tr/td/a/text()' + XPATH_PRICES = '//table[@class="table table-bordered table-striped"]/tbody/tr/td/text()' + try: + response = requests.get(url) + r = response.content.decode('utf-8') + parsed = html.fromstring(r) + print('Successful request') + states = parsed.xpath(XPATH_STATES) + prices = parsed.xpath(XPATH_PRICES) + gas_type_prices = [] + + i=0 + print('Creating dictionary...........') + for state in states: + j = i + 1 + k = i + 2 + state = dict(mangna=float(prices[i]), premium=float(prices[j]), diesel=float(prices[k])) + gas_type_prices.append(state) + i += 3 + state_prices = dict(zip(states, gas_type_prices)) + print('Dictionary completed') + + return state_prices + except ValueError as ve: + print('A problem ocurred in the state_prices, try again',ve) + def get_dataset(index): """ @@ -141,7 +177,7 @@ def reverse_geocode(stations_df, geo_gdf): """ Performs reverse geocoding on stations_gdf against the DIVA-GIS data to obtain city and state information - + Returns the stations GeoDataFrame with new columns for city and state """ stations_gdf = gpd.GeoDataFrame(stations_df, geometry=gpd.points_from_xy(stations_df.longitude, stations_df.latitude)).set_crs(epsg=4326) @@ -163,7 +199,7 @@ def get_states_with_most_rows(gdf, n): def transform(stations_df, geo_gdf): """ This function cleans the gas stations dataframe in order to obtain records - with at least one gas type price and correct values + with at least one gas type price and correct values It returns the representation of the cleaned dataframe as a list of dictionaries """ @@ -172,8 +208,8 @@ def transform(stations_df, geo_gdf): stations_complete_data_df = stations_df[stations_df['latitude'].notna() & stations_df['longitude'].notna() & (stations_df['regular_price'].notna() | stations_df['premium_price'].notna() | stations_df['diesel_price'].notna())].copy() - bad_records = stations_complete_data_df[(stations_complete_data_df['regular_price'] <= 1) | - (stations_complete_data_df['diesel_price'] <= 1) | (stations_complete_data_df['premium_price'] <= 1) | + bad_records = stations_complete_data_df[(stations_complete_data_df['regular_price'] <= 1) | + (stations_complete_data_df['diesel_price'] <= 1) | (stations_complete_data_df['premium_price'] <= 1) | (stations_complete_data_df['regular_price'] >= 40) | (stations_complete_data_df['diesel_price'] >= 40) | (stations_complete_data_df['premium_price'] >= 40)] @@ -243,7 +279,7 @@ def load(stations_dict): def run(): """ Entry point for this module - """ + """ raw_stations_df, geo_gdf = extract() clean_stations_dict = transform(raw_stations_df, geo_gdf) From 12187449201a1d8814ed101791c09fbdc5e00b5e Mon Sep 17 00:00:00 2001 From: Yair Mendoza Date: Wed, 4 Nov 2020 13:56:28 -0600 Subject: [PATCH 2/2] adding (cuarto_up.sh ) to .gitignoere --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 2800116..c69abf1 100644 --- a/.gitignore +++ b/.gitignore @@ -279,3 +279,4 @@ cride/media/ .env .envs/* !.envs/.local/ +cuarto_up.sh \ No newline at end of file