# lease_dataframe.py
from dotenv import load_dotenv, find_dotenv
from functions.dataframe_utils import *
from functions.geocoding_utils import *
from functions.mls_image_processing_utils import *
from functions.noise_level_utils import *
from functions.popup_utils import *
from functions.webscraping_utils import *
from geopy.geocoders import GoogleV3
from imagekitio import ImageKit
from loguru import logger
import geopandas as gpd
import glob
import os
import pandas as pd
import sys
## SETUP AND VARIABLES
load_dotenv(find_dotenv())
g = GoogleV3(api_key=os.getenv('GOOGLE_API_KEY')) # https://github.com/geopy/geopy/issues/171
# Replace the default loguru handler so this script's own messages are emitted with the custom format
logger.remove()
logger.add(sys.stderr, format="{time} {level} {message}", level="INFO")
# ImageKit.IO
# https://docs.imagekit.io/api-reference/upload-file-api/server-side-file-upload#uploading-file-via-url
# Create keys at https://imagekit.io/dashboard/developer/api-keys
imagekit = ImageKit(
    public_key=os.getenv('IMAGEKIT_PUBLIC_KEY'),
    private_key=os.getenv('IMAGEKIT_PRIVATE_KEY'),
    url_endpoint=os.getenv('IMAGEKIT_URL_ENDPOINT')
)
### PANDAS DATAFRAME OPERATIONS
# Load all CSVs and concat into one dataframe
# https://stackoverflow.com/a/21232849
path = "."
all_files = glob.glob(os.path.join(path, "*Renter*.csv"))
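# float_precision="round_trip" preserves float values exactly as they appear in the CSVs;
# skipinitialspace=True strips spaces that follow the comma delimiter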
df = pd.concat((pd.read_csv(f, float_precision="round_trip", skipinitialspace=True) for f in all_files), ignore_index=True)
pd.set_option("display.precision", 10)
# Strip leading and trailing whitespaces from the column names
# https://stackoverflow.com/a/36082588
df.columns = df.columns.str.strip()
# Convert all column names to lowercase
df.columns = df.columns.str.lower()
# Standardize the column names by renaming them
# https://stackoverflow.com/a/65332240
# Define a renaming dictionary with exact matches
rename_dict = {
    '# prking spaces': 'parking_spaces',
    'address': 'street_name',
    'baths(fthq)': 'bathrooms',
    'br': 'bedrooms',
    'city': 'city',
    'furnished': 'furnished',
    'key deposit': 'key_deposit',
    'laundry': 'laundry',
    'lease terms': 'terms',
    'lot sz': 'lot_size',
    'lp $/sqft': 'ppsqft',
    'lp': 'list_price',
    'mls': 'mls_number',
    'other deposit': 'other_deposit',
    'pet deposit': 'pet_deposit',
    'pets': 'pet_policy',
    'prop subtype': 'subtype',
    'security deposit': 'security_deposit',
    'sqft': 'sqft',
    'st #': 'street_number',
    'yb': 'year_built',
    'zip': 'zip_code',
    "seller's agent 1 cell": 'phone_number',
}
# Rename columns based on exact matches
df = df.rename(columns=rename_dict)
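# e.g. the lowercased export header 'baths(fthq)' is renamed to 'bathrooms'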
# Strip the leading house number and any following whitespace from street_name, e.g. '123 Main St' -> 'Main St'
df['street_name'] = df['street_name'].str.replace(r'^\d+\s*', '', regex=True)
# Drop rows with no street name; these are misc/irrelevant records
df.dropna(subset=['street_name'], inplace=True)
# Columns to clean
cols = ['key_deposit', 'other_deposit', 'security_deposit', 'list_price', 'pet_deposit']
# Remove all non-numeric characters, convert to numeric, and round to integers
numeric_cleaned = (
    df[cols]
    .replace({r'\$': '', ',': ''}, regex=True)
    .apply(pd.to_numeric, errors='coerce')
    .round(0)
)
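# e.g. '$2,500' becomes 2500; values that fail to parse become NaN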
# Identify rows where any value exceeds the UInt16 limit (65,535)
mask = (numeric_cleaned > 65535).any(axis=1)
if mask.any():
    # Join the MLS number column so that we log which MLS is affected,
    # then log the numeric columns that exceed the limit.
    culprit_rows = df.loc[mask, ['mls_number']].join(numeric_cleaned.loc[mask, :])
    logger.warning("Rows with values exceeding the UInt16 limit were found:")
    logger.warning(culprit_rows)
# Cast each column individually:
# If all values in a column are <= 65,535, use UInt16
for col in cols:
    max_val = numeric_cleaned[col].max(skipna=True)
    if pd.notna(max_val) and max_val <= 65535:
        df[col] = numeric_cleaned[col].astype(pd.UInt16Dtype())
    else:
        logger.error(f"Column '{col}' has values exceeding the UInt16 limit and cannot be cast to UInt16. Aborting.")
        sys.exit(1)
# Cast 'sqft' to UInt32 after removing commas; unparseable values become <NA>
df['sqft'] = pd.to_numeric(df['sqft'].replace({',': ''}, regex=True), errors='coerce').astype(pd.UInt32Dtype())
# Convert other columns to appropriate data types
df = df.astype({
    'year_built': 'UInt16',
    'parking_spaces': 'UInt8',
    'street_number': 'string'
})
# Handle lot_size column separately by removing commas, converting to numeric, and then to UInt32
df['lot_size'] = (
    df['lot_size']
    .replace({',': ''}, regex=True)
    .apply(pd.to_numeric, errors='coerce')
    .astype(pd.UInt32Dtype())
)
# Strip the leading $ sign from ppsqft (keeping the decimal point) and cast as Float32
df['ppsqft'] = df['ppsqft'].replace(to_replace=r'[^\d.]', value='', regex=True).astype(pd.Float32Dtype())
# Columns to be cast as strings
cols = ['mls_number', 'phone_number', 'street_name', 'zip_code', 'city']
df[cols] = df[cols].astype(pd.StringDtype())
# Columns to be cast as categories
cols = ['pet_policy', 'furnished', 'subtype', 'terms', 'laundry']
df[cols] = df[cols].astype(pd.CategoricalDtype())
# Extract total bathrooms and the counts by type (Full, Three-Quarter, Half, Quarter)
bathroom_cols = ['total_bathrooms', 'full_bathrooms', 'three_quarter_bathrooms', 'half_bathrooms', 'quarter_bathrooms']
df[bathroom_cols] = df['bathrooms'].str.extract(r'(\d+\.\d+)\s\((\d+)\s(\d+)\s(\d+)\s(\d+)\)').astype(float)
# Convert the bathroom columns to nullable integer type
for col in bathroom_cols:
    df[col] = df[col].astype(pd.UInt8Dtype())
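# e.g. a raw value of '3.00 (2 1 0 0)' yields total=3, full=2, three_quarter=1, half=0, quarter=0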
# Drop the original 'Baths(FTHQ)' column since we've extracted the data we need
df.drop(columns=['bathrooms'], inplace=True)
# Convert bedrooms to nullable integer type
df['bedrooms'] = df['bedrooms'].astype(pd.UInt8Dtype())
# Fetch missing city names
for row in df.loc[(df['city'].isnull()) & (df['zip_code'].notnull())].itertuples():
    df.at[row.Index, 'city'] = fetch_missing_city(f"{row.street_number} {row.street_name} {row.zip_code}", geolocator=g)
# Create a new column with the Street Number & Street Name
df["short_address"] = (df["street_number"].astype(str) + ' ' + df["street_name"] + ', ' + df['city']).astype(pd.StringDtype())
# Fetch missing zip codes
df = fetch_missing_zip_codes(df, geolocator=g)
df['zip_code'] = df['zip_code'].astype(pd.StringDtype())
# Remove the trailing .0 in the zip_code column
df['zip_code'] = df['zip_code'].str.replace(r'\.0$', '', regex=True)
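# e.g. '90210.0' becomes '90210'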
# Tag each row with the date it was processed
df['date_processed'] = pd.Timestamp.today()
# Create a new column with the full street address
df["full_street_address"] = (
df["street_number"].astype(str) + ' ' +
df["street_name"].str.strip() + ', ' +
df['city'] + ' ' +
df["zip_code"].astype(str)
).astype(pd.StringDtype())
# Get the listed date and MLS photo for each row
df = update_dataframe_with_listing_data(df, imagekit_instance=imagekit)
# Iterate through the dataframe and fetch coordinates for rows
for row in df.itertuples():
    coordinates = return_coordinates(address=row.full_street_address, row_index=row.Index, geolocator=g, total_rows=len(df))
    df.at[row.Index, 'latitude'] = coordinates[0]
    df.at[row.Index, 'longitude'] = coordinates[1]
# These columns should stay floats
df['latitude'] = pd.to_numeric(df['latitude'], errors='raise', downcast='float')
df['longitude'] = pd.to_numeric(df['longitude'], errors='raise', downcast='float')
## Laundry Features ##
# Cast the laundry column as string dtype and replace empty or whitespace-only values with "Unknown"
df.laundry = df.laundry.astype("string").replace(r'^\s*$', "Unknown", regex=True)
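# e.g. an empty or whitespace-only laundry cell becomes 'Unknown'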
# Fill in any NaNs in the Laundry column with "Unknown"
df.laundry = df.laundry.fillna(value="Unknown")
# Replace various patterns in the Laundry column with "Community Laundry"
df.laundry = df.laundry.str.replace(
    r'Community Laundry Area|Laundry Area|Community|Common',
    'Community Laundry',
    regex=True
)
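# e.g. both 'Laundry Area' and 'Common' become 'Community Laundry'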
# Flatten the subtype column
df = flatten_subtype_column(df)
# Convert the listed date into DateTime and use the "mixed" format to handle the different date formats
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html
df['listed_date'] = pd.to_datetime(df['listed_date'], errors='raise', format='mixed')
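# format='mixed' (pandas >= 2.0) infers the format of each date string individually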
# Convert date_processed into DateTime
df['date_processed'] = pd.to_datetime(df['date_processed'], errors='coerce', format='%Y-%m-%d')
# Reindex the dataframe
df.reset_index(drop=True, inplace=True)
# Attach a pageType context to each feature's properties to pass through to the onEachFeature JavaScript function
df['context'] = [{"pageType": "lease"} for _ in range(len(df))]
## COMBINE WITH THE OLD DATAFRAME AND SAVE FOR INGESTION BY APP.PY ##
# Read in the old dataframe
df_old = gpd.read_file(filename='https://github.com/perfectly-preserved-pie/larentals/raw/master/assets/datasets/lease.geojson')
# Combine both old and new dataframes
df_combined = pd.concat([df, df_old], ignore_index=True)
# Drop any duplicate listings by MLS number
df_combined = df_combined.drop_duplicates(subset=['mls_number'], keep="last")
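# Note: because df_old is concatenated after df, keep="last" retains the existing row from df_old for any mls_number present in both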
# Drop rows for listings that are no longer active
df_combined = remove_inactive_listings(df_combined)
# Categorize the laundry features
df_combined['laundry_category'] = df_combined['laundry'].apply(categorize_laundry_features)
# Reset the index
df_combined = df_combined.reset_index(drop=True)
# Remove the trailing .0 from the street_number, full_street_address, and short_address columns
# (\b prevents stripping '.0' from inside longer decimals such as '10.01')
df_combined['street_number'] = df_combined['street_number'].str.replace(r'\.0\b', '', regex=True)
df_combined['full_street_address'] = df_combined['full_street_address'].str.replace(r'\.0\b', '', regex=True)
df_combined['short_address'] = df_combined['short_address'].str.replace(r'\.0\b', '', regex=True)
# Convert the combined DataFrame to a GeoDataFrame and save it
gdf_combined = gpd.GeoDataFrame(
    df_combined,
    geometry=gpd.points_from_xy(df_combined.longitude, df_combined.latitude)
)
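# Note: gpd.points_from_xy takes x (longitude) first, then y (latitude)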
# Re-geocode rows where latitude is above a certain threshold
gdf_combined = re_geocode_above_lat_threshold(gdf_combined, geolocator=g)
# Drop some columns that are no longer needed
reduce_geojson_columns(gdf=gdf_combined)
# Save the GeoDataFrame as a GeoJSON file
try:
    gdf_combined.to_file("assets/datasets/lease.geojson", driver="GeoJSON")
    logger.info("Saved the combined GeoDataFrame to a GeoJSON file.")
except Exception as e:
    logger.error(f"Error saving the combined GeoDataFrame to a GeoJSON file: {e}")
# Reclaim space in ImageKit
#reclaim_imagekit_space(geojson_path="assets/datasets/lease.geojson", imagekit_instance=imagekit)