from dotenv import load_dotenv, find_dotenv
from functions.dataframe_utils import *
from functions.geocoding_utils import *
from functions.mls_image_processing_utils import *
from functions.noise_level_utils import *
from functions.popup_utils import *
from geopy.geocoders import GoogleV3
from imagekitio import ImageKit
from loguru import logger
import geopandas as gpd
import glob
import os
import pandas as pd
import sys
## SETUP AND VARIABLES
load_dotenv(find_dotenv())
g = GoogleV3(api_key=os.getenv('GOOGLE_API_KEY')) # https://github.com/geopy/geopy/issues/171
# Don't filter on a module name here; this script runs as __main__, so a "my_module" filter would drop every message
logger.add(sys.stderr, format="{time} {level} {message}", level="INFO")
# ImageKit.IO
# https://docs.imagekit.io/api-reference/upload-file-api/server-side-file-upload#uploading-file-via-url
# Create keys at https://imagekit.io/dashboard/developer/api-keys
imagekit = ImageKit(
    public_key=os.getenv('IMAGEKIT_PUBLIC_KEY'),
    private_key=os.getenv('IMAGEKIT_PRIVATE_KEY'),
    url_endpoint=os.getenv('IMAGEKIT_URL_ENDPOINT')
)
### PANDAS DATAFRAME OPERATIONS
# Read all sheets from the first Excel file in the working directory
xlsx_files = glob.glob('*.xlsx')
if not xlsx_files:
    sys.exit("No .xlsx file found in the current directory.")
excel_file = xlsx_files[0]
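# sheet_name=None makes read_excel return a dict of {sheet name: DataFrame}, one entry per sheet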
xlsx = pd.read_excel(excel_file, sheet_name=None)
## Remember: N/A = Not Applicable to this home type, while NaN = unknown (the data is missing for some reason)
# Assuming first sheet is single-family homes, second sheet is condos/townhouses, and third sheet is mobile homes
# Set the subtype of every row in the first sheet to "Single Family Residence"
xlsx[list(xlsx.keys())[0]]["subtype"] = "Single Family Residence"
# Initialize an empty dictionary to hold the renamed DataFrames
# (each sheet's columns are normalized and renamed below, then all of the sheets are merged into a single DataFrame)
renamed_sheets_corrected = {}
# Using a dictionary for clearer column renaming
specific_column_map = {
'# prking spaces': 'garage_spaces',
'address': 'street_address',
'baths(fthq)': 'bedrooms_bathrooms',
'br': 'bedrooms',
'city': 'city',
'hoa fee freq.1': 'hoa_fee_frequency',
'hoa fees': 'hoa_fee',
'hoa': 'hoa_fee',
'hod': 'hoa_fee',
'list price': 'list_price',
'lot size': 'lot_size',
'lot sz': 'lot_size',
'lp $/sqft': 'ppsqft',
'lp': 'list_price',
'mls': 'mls_number',
'price per sqft': 'ppsqft',
'price per square foot': 'ppsqft',
'prop subtype': 'subtype',
'sqft': 'sqft',
'st #': 'street_number',
'sub type': 'subtype',
'yb': 'year_built',
'year built': 'year_built',
'yr built': 'year_built',
'zip': 'zip_code',
}
# Rename columns using the specific map in a safe manner
for sheet_name, sheet_df in xlsx.items():
    # Strip leading and trailing whitespace from the column names and convert them to lowercase
    # https://stackoverflow.com/a/36082588
    sheet_df.columns = sheet_df.columns.str.strip().str.lower()
    # Identify columns that can be renamed
    existing_renames = {col: specific_column_map[col] for col in sheet_df.columns if col in specific_column_map}
    if existing_renames:
        logger.info(f"Renaming columns in sheet '{sheet_name}': {existing_renames}")
    else:
        logger.warning(f"No columns to rename in sheet '{sheet_name}'.")
    # Rename the columns
    renamed_sheet_df = sheet_df.rename(columns=existing_renames)
    # Add the renamed DataFrame to the dictionary
    renamed_sheets_corrected[sheet_name] = renamed_sheet_df
# Then proceed with concatenation
df = pd.concat(renamed_sheets_corrected.values(), ignore_index=True)
# Drop all rows with misc/irrelevant data
df.dropna(subset=['mls_number'], inplace=True)
# Define columns to remove all non-numeric characters from
cols = ['hoa_fee', 'list_price', 'ppsqft', 'sqft', 'year_built', 'lot_size']
# Loop through the columns and remove all non-numeric characters, except for the literal string "N/A"
for col in cols:
    df[col] = df[col].apply(lambda x: str(x) if str(x) == 'N/A' else ''.join(c for c in str(x) if c.isdigit() or c == '.'))
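# e.g. '$1,234.56' -> '1234.56', while 'N/A' (Not Applicable) passes through untouched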
# Reindex the dataframe
df.reset_index(drop=True, inplace=True)
# Fetch missing city names by geocoding the street address and zip code (the city is null for these rows, so leave it out of the query)
for row in df.loc[(df['city'].isnull()) & (df['zip_code'].notnull())].itertuples():
    df.at[row.Index, 'city'] = fetch_missing_city(f"{row.street_address} {str(row.zip_code)}", geolocator=g)
# Cast these columns as strings so we can concatenate them
cols = ['street_number', 'street_address', 'city', 'mls_number']
for col in cols:
    df[col] = df[col].astype("string")
# Create a new column with the street address and city
df["short_address"] = df["street_address"].str.strip() + ', ' + df['city']
# Fetch missing zip codes
df = fetch_missing_zip_codes(df, geolocator=g)
df['zip_code'] = df['zip_code'].astype(pd.StringDtype())
# Tag each row with the date it was processed
df['date_processed'] = pd.Timestamp.today()
# Create a new column with the full street address
# Also strip whitespace from the street address column
# zip_code was already cast to a nullable string above, so it can be concatenated directly
# https://stackoverflow.com/a/11858532
df["full_street_address"] = df["street_address"].str.strip() + ', ' + df['city'] + ' ' + df["zip_code"]
# Iterate through the dataframe and get the listed date and photo for rows
df = update_dataframe_with_listing_data(df, imagekit_instance=imagekit)
# Iterate through the dataframe and fetch coordinates for rows
for row in df.itertuples():
    coordinates = return_coordinates(address=row.full_street_address, row_index=row.Index, geolocator=g, total_rows=len(df))
    df.at[row.Index, 'latitude'] = coordinates[0]
    df.at[row.Index, 'longitude'] = coordinates[1]
# Disabled for now: HowLoud noise scores
#df = update_howloud_scores(df)
# Cast HowLoud columns as either nullable strings or nullable integers
#howloud_columns = [col for col in df.columns if col.startswith("howloud_")]
#for col in howloud_columns:
#    # Check if the content is purely numeric
#    if df[col].dropna().astype(str).str.isnumeric().all():
#        df[col] = df[col].astype(pd.Int32Dtype())  # Cast to nullable integer
#    else:
#        df[col] = df[col].astype(pd.StringDtype())  # Cast to string
### BATHROOMS PARSING
# Split the bedrooms_bathrooms column to extract the total and detailed bathroom counts
# Expected format: '1.00 (1 0 0 0)' where:
# - 1.00 is the total bathrooms
# - 1 is full bathrooms
# - 0 is half bathrooms
# - 0 is three-quarter bathrooms
# - 0 is extra bathrooms
# Define a regex pattern to extract the total bathrooms and the bathroom details inside the parentheses
bathroom_pattern = r'^(\d+\.\d+)\s+\((\d+)\s+(\d+)\s+(\d+)\s+(\d+)\)'
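# e.g. '2.00 (1 1 0 0)' -> groups ('2.00', '1', '1', '0', '0')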
# Extract the numbers: total, full, half, three_quarter, extra
bathroom_details = df['bedrooms_bathrooms'].str.extract(bathroom_pattern)
# Check if extraction was successful
if bathroom_details.isnull().values.any():
    logger.warning("Some rows in 'bedrooms_bathrooms' do not match the expected format.")
# Rename the extracted columns
bathroom_details.columns = [
    'total_bathrooms',
    'full_bathrooms',
    'half_bathrooms',
    'three_quarter_bathrooms',
    'extra_bathrooms'
]
# Convert every extracted column to a nullable unsigned integer
# (total_bathrooms comes out of the regex as a float string like '2.00', so go through to_numeric first)
bathroom_details = bathroom_details.apply(pd.to_numeric, errors='coerce').astype("UInt8")
# Assign the extracted bathrooms to the main DataFrame
df = pd.concat([df, bathroom_details], axis=1)
# If total_bathrooms is missing, fill it with the sum of the detailed bathrooms
df['total_bathrooms'] = df['total_bathrooms'].fillna(df['full_bathrooms'] + df['half_bathrooms'] + df['three_quarter_bathrooms'] + df['extra_bathrooms'])
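# Note: with nullable UInt8 columns, the sum above is <NA> if any individual count is missing, so a partially-parsed row stays missing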
logger.info("Bathroom columns extracted and total_bathrooms updated.")
logger.debug(df[['bedrooms_bathrooms', 'total_bathrooms', 'full_bathrooms', 'half_bathrooms', 'three_quarter_bathrooms']].sample(n=min(10, len(df))))
# Latitude and longitude should stay floats; coerce anything non-numeric to NaN
df['latitude'] = df['latitude'].apply(pd.to_numeric, errors='coerce')
df['longitude'] = df['longitude'].apply(pd.to_numeric, errors='coerce')
# Convert zip_code into a string, going through a nullable integer so missing values don't leave a trailing '.0'
df['zip_code'] = pd.to_numeric(df['zip_code'], errors='coerce').astype('Int64').astype('string')
# Convert the listed date into DateTime and use the "mixed" format to handle the different date formats
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html
df['listed_date'] = pd.to_datetime(df['listed_date'], errors='raise', format='mixed')
# Convert date_processed into DateTime
df['date_processed'] = pd.to_datetime(df['date_processed'], errors='raise', format='mixed')
# Leave bedrooms_bathrooms out of the numeric cleanup below; its counts were already extracted above and the raw string should survive intact
cols = ['full_bathrooms', 'bedrooms', 'year_built', 'sqft', 'list_price', 'total_bathrooms', 'ppsqft', 'hoa_fee']
# Convert columns to string type for string operations
df[cols] = df[cols].astype(str)
# Remove commas and other non-numeric characters
df[cols] = df[cols].replace({',': '', r'[^0-9\.]': ''}, regex=True)
# Replace empty strings with a placeholder; pd.to_numeric below coerces it (and anything else non-numeric) to NaN
df[cols] = df[cols].replace('', 'Unknown')
# Convert columns to numeric
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
# Cast specified columns as nullable integers
int_cols = ['full_bathrooms', 'bedrooms', 'year_built', 'sqft', 'list_price', 'total_bathrooms']
df[int_cols] = df[int_cols].astype('Int64')
# Cast these columns as nullable strings
cols = ['short_address', 'full_street_address', 'mls_number', 'mls_photo', 'listing_url', 'subtype', 'bedrooms_bathrooms', 'hoa_fee_frequency']
for col in cols:
    df[col] = df[col].astype('string')
# Reindex the dataframe
df.reset_index(drop=True, inplace=True)
# Add pageType context to each feature's properties so it can be read by the onEachFeature JavaScript function
df['context'] = [{"pageType": "buy"} for _ in range(len(df))]
### SAVE THE DATAFRAME FOR LATER INGESTION BY app.py
# Read in the old dataframe from the previously published GeoJSON
df_old = gpd.read_file(filename='https://github.com/perfectly-preserved-pie/larentals/raw/master/assets/datasets/buy.geojson')
# Combine both old and new dataframes
df_combined = pd.concat([df, df_old], ignore_index=True)
# Drop any duplicate listings, preferring the previously published row when the MLS number already exists
df_combined = df_combined.drop_duplicates(subset=['mls_number'], keep="last")
# Flatten the subtype column
df_combined = flatten_subtype_column(df_combined)
# Iterate through the dataframe and drop rows with expired listings
df_combined = remove_inactive_listings(df_combined)
# Reset the index
df_combined = df_combined.reset_index(drop=True)
# Remove the trailing .0 (a float-to-string artifact) from the street_number, full_street_address, and short_address columns
df_combined['street_number'] = df_combined['street_number'].str.replace(r'\.0\b', '', regex=True)
df_combined['full_street_address'] = df_combined['full_street_address'].str.replace(r'\.0\b', '', regex=True)
df_combined['short_address'] = df_combined['short_address'].str.replace(r'\.0\b', '', regex=True)
# Filter the dataframe for rows outside of California
outside_ca_rows = df_combined[
    (df_combined['latitude'] < 32.5) |
    (df_combined['latitude'] > 42) |
    (df_combined['longitude'] < -124) |
    (df_combined['longitude'] > -114)
]
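# (32.5-42 degrees N latitude and 114-124 degrees W longitude is a rough bounding box for California; anything outside it was geocoded incorrectly)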
total_outside_ca = len(outside_ca_rows)
counter = 0
for row in outside_ca_rows.itertuples():
    counter += 1
    logger.warning(f"Row {counter} out of {total_outside_ca}: {row.mls_number} has coordinates {row.latitude}, {row.longitude} which is outside California. Re-geocoding {row.mls_number}...")
    # Re-geocode the row
    coordinates = return_coordinates(address=row.full_street_address, row_index=row.Index, geolocator=g, total_rows=len(df_combined))
    df_combined.at[row.Index, 'latitude'] = coordinates[0]
    df_combined.at[row.Index, 'longitude'] = coordinates[1]
# Convert the combined DataFrame to a GeoDataFrame so it can be saved as GeoJSON
gdf_combined = gpd.GeoDataFrame(
    df_combined,
    geometry=gpd.points_from_xy(df_combined.longitude, df_combined.latitude)
)
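# points_from_xy takes x (longitude) first and y (latitude) second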
# Re-geocode rows where latitude is above a certain threshold
gdf_combined = re_geocode_above_lat_threshold(gdf_combined, geolocator=g)
# Drop some columns that are no longer needed
reduce_geojson_columns(gdf=gdf_combined)
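# (reduce_geojson_columns is assumed to modify the GeoDataFrame in place; if it returns a new frame instead, assign the result back to gdf_combined)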
# Save the GeoDataFrame as a GeoJSON file
try:
    gdf_combined.to_file("assets/datasets/buy.geojson", driver="GeoJSON")
    logger.info("Saved the combined GeoDataFrame to a GeoJSON file.")
except Exception as e:
    logger.error(f"Error saving the combined GeoDataFrame to a GeoJSON file: {e}")
# Reclaim space in ImageKit
#reclaim_imagekit_space(geojson_path="assets/datasets/buy.geojson", imagekit_instance=imagekit)