From 781309c9fcbf5f62ec234ecb86f559023b3da40c Mon Sep 17 00:00:00 2001 From: claireboyd Date: Mon, 6 May 2024 11:40:35 -0500 Subject: [PATCH] fixed error - changed filepaths --- .DS_Store | Bin 0 -> 6148 bytes fa-etl.py | 46 ++++++++------ scratch.ipynb | 167 ++++++++++++++++---------------------------------- 3 files changed, 80 insertions(+), 133 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..2a847af6bbd15ca20ee0f2d56d2b6a75be65a844 GIT binary patch literal 6148 zcmeHK%}T>S5Z-O8O({YS3Oz1(E!cu>1ur4i7cim+m70*C!I&*g+8jzDXMG``#OHBl zcLNr47O^w1`_1oe_JiyXV~o4YaLAa=7_*=ua#U&r-L;{HNk-&2Mm7%;8G!W>%uVdC z1Acpxg)CyLp!oj%Nu1?_!6$DtTYLL$t8I0y2k%)HUhe11%=H&{XkAO01eNXwH*r+V z?86J0=6;+;3sn$@5v1JT#%U-ES1!{qQ?;HBSRJb~vwQ3H_-xb{!|`a-7wgGr*cX$* z>1NZhj*d^xuVydFYbxI~ogCO!a$vB6cTmh~R^B2_Wcmc2D!a-eBnF59Vt^RfZ3fJ_ zV0Cw!23kEaKn&C{fct}hhUge9HL9%xI=nt(yoHDYI=&?kg+a$)sSzR|T$cjsQf{6Y zT$h7im^{Z|sZp0Ru4aaD%*@r}g{#@YFH}0?jz;Q<0b*dAfu=TXJpV7@m#KZ^ZAfRtu0>Hq1q^E*9E>MR&$6%=uM?t?T Q2c(ODB7{0(;1?M90yJ4kdH?_b literal 0 HcmV?d00001 diff --git a/fa-etl.py b/fa-etl.py index e86350e..bbe640a 100644 --- a/fa-etl.py +++ b/fa-etl.py @@ -10,6 +10,13 @@ def mem_profile() -> str: mem_use = str(round(100 - psutil.virtual_memory().percent,4))+'% of '+str(round(psutil.virtual_memory().total/1e+9,3))+' GB RAM' return mem_use +def is_lazydataframe_empty(ldf): + """ + Checks if a polars lazy dataframe is empty given a lazy dataframe. + Returns: boolean (True, False) + """ + return ((ldf.describe().filter(pl.col("statistic") == "count")["PropertyID"])[0] == 0) + def convert_sales(filename, input_dir): ''' Convert zipped txt sales (deed) file into parquet format. @@ -266,6 +273,8 @@ def convert_valhist(filename, input_dir): input_filepath = input_dir + "/raw/" + filename output_dir = input_dir + "/" + "staging" output_filepath = output_dir + "/" + filename.replace(".txt.zip", ".parquet") + output_filepath_temp1 = output_dir + "/rankedtemp1_" + filename.replace(".txt.zip", ".parquet") + output_filepath_temp2 = output_dir + "/rankedtemp2_" + filename.replace(".txt.zip", ".parquet") output_filepath_ranked = output_dir + "/ranked_" + filename.replace(".txt.zip", ".parquet") # check if parquet already exists, if it does, skip @@ -303,7 +312,7 @@ def convert_valhist(filename, input_dir): logging.info(f"{output_filepath} already exists. Moving on...") if not os.path.exists(output_filepath_ranked): - logging.info(f"Creating {output_filepath_ranked}...") + logging.info(f"Creating {output_filepath_temp1}...") #temp filepaths assd_filepath = output_dir+"/assd.parquet" @@ -312,11 +321,7 @@ def convert_valhist(filename, input_dir): logging.info(f"filepaths: {assd_filepath}, {market_filepath} and {appr_filepath}...") if not os.path.exists(assd_filepath) & os.path.exists(market_filepath) & os.path.exists(appr_filepath): - logging.info(f"Creating assd parquet...") - #split val hist into three separate datasets with PropertyID, Year as consistent - ##TODO: this seems super repetitive, but it might be the best option given the size of the dataset. - (pl.scan_parquet(Path(output_filepath), low_memory = True, use_statistics=True, hive_partitioning=True) .with_columns([pl.col('AssdYear').cast(pl.Int64).alias('Year')]) .filter( @@ -342,9 +347,10 @@ def convert_valhist(filename, input_dir): #write checks - make sure there are no duplicates in the above (by propID/year) # if so, raise error and don't proceed - assd = pl.scan_parquet(Path(assd_filepath), low_memory = True, parallel='auto') - appr = pl.scan_parquet(Path(appr_filepath), low_memory = True, parallel='auto') - market = pl.scan_parquet(Path(market_filepath), low_memory = True, parallel='auto') + + assd = pl.scan_parquet(Path(assd_filepath), low_memory = True) + appr = pl.scan_parquet(Path(appr_filepath), low_memory = True) + market = pl.scan_parquet(Path(market_filepath), low_memory = True) logging.info(f"Joining assessed values and market values on propid/year...") # join with market data @@ -352,40 +358,40 @@ def convert_valhist(filename, input_dir): other=market, how="left", on=['PropertyID', 'Year'], - ).sink_parquet(Path(output_filepath_ranked), compression="snappy") - + ).sink_parquet(Path(output_filepath_temp1), compression="snappy") logging.info(f"val/market join on propid/year complete. Starting second join...") + rankedtemp1_valhist = pl.scan_parquet(Path(output_filepath_temp1), low_memory = True) + logging.info(f"is ranked_valhist empty? {is_lazydataframe_empty(rankedtemp1_valhist)}") + # check if the length of the output of a ldf is 0 (aka dataframe is empty) logging.info(f"Check if appraisal dataframe is empty...") - if (appr.describe().filter(pl.col("statistic") == "count")["PropertyID"])[0] != 0: + if not is_lazydataframe_empty(appr): logging.info(f"Appraisal dataframe is not empty! Joining with val/market...") - - (pl.scan_parquet(Path(output_filepath_ranked), low_memory = True, parallel='row_groups', use_statistics=False, hive_partitioning=False) + (rankedtemp1_valhist # # join with appr data ).join( other=appr, how="left", on=['PropertyID', 'Year'], ).sink_parquet( - Path(output_filepath_ranked), + Path(output_filepath_temp2), compression="snappy" ) - else: + else: logging.info(f"Appraisal dataframe is empty! Adding a col of nulls for appraisal col...") - (pl.scan_parquet(Path(output_filepath_ranked), low_memory = True, parallel='row_groups', use_statistics=False, hive_partitioning=False) - # # join with appr data + (rankedtemp1_valhist + # add col of nulls for ApprTotalValue because not present for any PropIDs ).with_columns([ - # add col of nulls for ApprTotalValue because not present for any PropIDs pl.when(True).then(None).alias("ApprTotalValue") ]).sink_parquet( - Path(output_filepath_ranked), + Path(output_filepath_temp2), compression="snappy" ) logging.info(f"val/market/appr join on propid/year complete. Doing with_cols operations...") - (pl.scan_parquet(Path(output_filepath_ranked), low_memory = True, parallel='row_groups', use_statistics=False, hive_partitioning=False) + (pl.scan_parquet(Path(output_filepath_temp2), low_memory = True) .with_columns([ #value conditional pl.when((pl.col("AssdTotalValue").is_not_null()) & (pl.col("AssdTotalValue") != 0)) diff --git a/scratch.ipynb b/scratch.ipynb index efd9477..d44198f 100644 --- a/scratch.ipynb +++ b/scratch.ipynb @@ -2,18 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "#set up autoreload\n", "%load_ext autoreload\n", @@ -28,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -49,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -57,184 +48,134 @@ " other=market,\n", " how=\"left\",\n", " on=['PropertyID', 'Year'],\n", - ").sink_parquet(Path(output_filepath_ranked), compression=\"snappy\")\n" + ").sink_parquet(Path(output_filepath_ranked), compression=\"snappy\")" ] }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "valhist = pl.read_parquet(path_to_dir+f\"dev/{county}/staging/ranked_ValHist{county}.parquet\")" + "ranked_valhist = pl.read_parquet(path_to_dir+f\"dev/{county}/staging/ranked_ValHist{county}.parquet\")" ] }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(3566623, 4)" + "False" ] }, - "execution_count": 72, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "valhist.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [], - "source": [ - "lf = pl.LazyFrame(\n", - " {\n", - " \"a\": [1, 2, 3, 4],\n", - " \"b\": [0.5, 4, 10, 13],\n", - " \"c\": [True, True, False, True],\n", - " }\n", - ")" + "fa_etl.is_lazydataframe_empty(path_to_dir+f\"dev/{county}/staging/ranked_ValHist{county}.parquet\")" ] }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "shape: (4, 4)
abcAppr
i64f64boolnull
10.5truenull
24.0truenull
310.0falsenull
413.0truenull
" - ], "text/plain": [ - "shape: (4, 4)\n", - "┌─────┬──────┬───────┬──────┐\n", - "│ a ┆ b ┆ c ┆ Appr │\n", - "│ --- ┆ --- ┆ --- ┆ --- │\n", - "│ i64 ┆ f64 ┆ bool ┆ null │\n", - "╞═════╪══════╪═══════╪══════╡\n", - "│ 1 ┆ 0.5 ┆ true ┆ null │\n", - "│ 2 ┆ 4.0 ┆ true ┆ null │\n", - "│ 3 ┆ 10.0 ┆ false ┆ null │\n", - "│ 4 ┆ 13.0 ┆ true ┆ null │\n", - "└─────┴──────┴───────┴──────┘" + "(3566623, 4)" ] }, - "execution_count": 56, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "lf.with_columns([\n", - " pl.when(True).then(None).alias(\"ApprTotalValue\")\n", - "]).collect()" + "ranked_valhist.shape" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "shape: (0, 16)
PropertyIDSaleAmtRecordingDateFIPSFATimeStampFATransactionIDTransactionTypeSaleDateRecordingYearSlicePropertyID_strFATransactionID_1RecordingYearSaleYearFATimeStampYearSaleFlagPropIDFlag
i64i64datestrdatei64strdatestrstrstri64i32i32i32i32
" - ], "text/plain": [ - "shape: (0, 16)\n", - "┌────────────┬─────────┬──────────────┬──────┬───┬──────────┬──────────────┬──────────┬────────────┐\n", - "│ PropertyID ┆ SaleAmt ┆ RecordingDat ┆ FIPS ┆ … ┆ SaleYear ┆ FATimeStampY ┆ SaleFlag ┆ PropIDFlag │\n", - "│ --- ┆ --- ┆ e ┆ --- ┆ ┆ --- ┆ ear ┆ --- ┆ --- │\n", - "│ i64 ┆ i64 ┆ --- ┆ str ┆ ┆ i32 ┆ --- ┆ i32 ┆ i32 │\n", - "│ ┆ ┆ date ┆ ┆ ┆ ┆ i32 ┆ ┆ │\n", - "╞════════════╪═════════╪══════════════╪══════╪═══╪══════════╪══════════════╪══════════╪════════════╡\n", - "└────────────┴─────────┴──────────────┴──────┴───┴──────────┴──────────────┴──────────┴────────────┘" + "OrderedDict([('PropertyID', Int64),\n", + " ('AssdTotalValue', Int64),\n", + " ('AssdYear', Int64),\n", + " ('MarketTotalValue', Int64),\n", + " ('MarketValueYear', Int64),\n", + " ('ApprTotalValue', Int64),\n", + " ('ApprYear', Int64),\n", + " ('TaxableYear', Int64)])" ] }, - "execution_count": 12, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "sales.filter(\n", - " pl.col(\"PropIDFlag\") == 1\n", - ")" + "valhist.schema" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "merged = pl.read_parquet(path_to_dir+f\"dev/{county}/unified/merged.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", - "shape: (44_004, 8)
PropertyIDYearValueAssessmentUsedSaleAmtTaxAmtTaxAmtAdjustedApproxTaxRate
i64i64i64stri64i64f64f64
918478702018117677"Assd"1580000131042413104.2411.13577
918490082018247162"Assd"4400000311405231140.5212.599235
91848046201871155"Assd"7127757733527733.5210.868555
918465492018149310"Assd"2475000152737215273.7210.229536
918475372018128574"Assd"1625000149614814961.4811.636474
918483332018356895"Assd"7400000395877639587.7611.092271
918466492018219589"Assd"3100000251893225189.3211.471121
918472152018299300"Assd"2825000332703633270.3611.116057
918461342022718067"Assd"12500000880852888085.2812.267
918472892018143322"Assd"2840000172320017232.012.023276
91848235201898891"Assd"840000107484410748.4410.868977
91847234201897929"Assd"2000000113812411381.2411.62193
91951526202043461"Assd"280000nullnullnull
919503282020791550"Assd"12000000nullnullnull
919514972023388800"Assd"2613000486077848607.7812.502001
919514942023423000"Assd"2321000528834652883.4612.502
919514992023376200"Assd"2566000470325247032.5212.501999
919515052023418950"Assd"1942000523771352377.1312.502
919515082023418950"Assd"2795000523771352377.1312.502
91950049202388208"Assd"580000110277611027.7612.501995
91950055202390672"Assd"570000113358111335.8112.501996
91949993202330010"Assd"5500003751853751.8512.501999
91950231202330130"Assd"46013513766853766.8512.501991
91951304202336494"Assd"9715627048637048.6319.31449
" + "shape: (5, 17)
PropertyIDYearValueMarketTotalValueApprTotalValueSitusLatitudeSitusLongitudeSitusFullStreetAddressSitusCitySitusStateSitusZIP5FIPSSitusCensusTractSitusCensusBlockSaleAmtTaxAmtTaxAmtAdjusted
i64i64i64i64nullf64f64strstrstrstrstrstrstri64i64f64
918478702018117677261504null40.750965-73.982198"425 5TH AVE AP…"NEW YORK""NY""10016""36061""008200""2004"1580000131042413104.24
9184900820182471621462000null40.750447-73.997723"362 W 30TH ST""NEW YORK""NY""10001""36061""009700""4000"4400000311405231140.52
91848046201871155158123null40.738643-73.987954"254 PARK AVE S…"NEW YORK""NY""10010""36061""005200""1000"7127757733527733.52
918465492018149310331800null40.729262-74.004785"63 DOWNING ST …"NEW YORK""NY""10014""36061""006700""2002"2475000152737215273.72
918475372018128574285721null40.73889-73.987904"260 PARK AVE S…"NEW YORK""NY""10010""36061""005200""1000"1625000149614814961.48
" ], "text/plain": [ - "shape: (44_004, 8)\n", - "┌────────────┬──────┬────────┬────────────────┬─────────┬─────────┬────────────────┬───────────────┐\n", - "│ PropertyID ┆ Year ┆ Value ┆ AssessmentUsed ┆ SaleAmt ┆ TaxAmt ┆ TaxAmtAdjusted ┆ ApproxTaxRate │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ i64 ┆ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 ┆ f64 ┆ f64 │\n", - "╞════════════╪══════╪════════╪════════════════╪═════════╪═════════╪════════════════╪═══════════════╡\n", - "│ 91847870 ┆ 2018 ┆ 117677 ┆ Assd ┆ 1580000 ┆ 1310424 ┆ 13104.24 ┆ 11.13577 │\n", - "│ 91849008 ┆ 2018 ┆ 247162 ┆ Assd ┆ 4400000 ┆ 3114052 ┆ 31140.52 ┆ 12.599235 │\n", - "│ 91848046 ┆ 2018 ┆ 71155 ┆ Assd ┆ 712775 ┆ 773352 ┆ 7733.52 ┆ 10.868555 │\n", - "│ 91846549 ┆ 2018 ┆ 149310 ┆ Assd ┆ 2475000 ┆ 1527372 ┆ 15273.72 ┆ 10.229536 │\n", - "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ 91950055 ┆ 2023 ┆ 90672 ┆ Assd ┆ 570000 ┆ 1133581 ┆ 11335.81 ┆ 12.501996 │\n", - "│ 91949993 ┆ 2023 ┆ 30010 ┆ Assd ┆ 550000 ┆ 375185 ┆ 3751.85 ┆ 12.501999 │\n", - "│ 91950231 ┆ 2023 ┆ 30130 ┆ Assd ┆ 4601351 ┆ 376685 ┆ 3766.85 ┆ 12.501991 │\n", - "│ 91951304 ┆ 2023 ┆ 36494 ┆ Assd ┆ 971562 ┆ 704863 ┆ 7048.63 ┆ 19.31449 │\n", - "└────────────┴──────┴────────┴────────────────┴─────────┴─────────┴────────────────┴───────────────┘" + "shape: (5, 17)\n", + "┌────────────┬──────┬────────┬───────────────┬───┬──────────────┬─────────┬─────────┬──────────────┐\n", + "│ PropertyID ┆ Year ┆ Value ┆ MarketTotalVa ┆ … ┆ SitusCensusB ┆ SaleAmt ┆ TaxAmt ┆ TaxAmtAdjust │\n", + "│ --- ┆ --- ┆ --- ┆ lue ┆ ┆ lock ┆ --- ┆ --- ┆ ed │\n", + "│ i64 ┆ i64 ┆ i64 ┆ --- ┆ ┆ --- ┆ i64 ┆ i64 ┆ --- │\n", + "│ ┆ ┆ ┆ i64 ┆ ┆ str ┆ ┆ ┆ f64 │\n", + "╞════════════╪══════╪════════╪═══════════════╪═══╪══════════════╪═════════╪═════════╪══════════════╡\n", + "│ 91847870 ┆ 2018 ┆ 117677 ┆ 261504 ┆ … ┆ 2004 ┆ 1580000 ┆ 1310424 ┆ 13104.24 │\n", + "│ 91849008 ┆ 2018 ┆ 247162 ┆ 1462000 ┆ … ┆ 4000 ┆ 4400000 ┆ 3114052 ┆ 31140.52 │\n", + "│ 91848046 ┆ 2018 ┆ 71155 ┆ 158123 ┆ … ┆ 1000 ┆ 712775 ┆ 773352 ┆ 7733.52 │\n", + "│ 91846549 ┆ 2018 ┆ 149310 ┆ 331800 ┆ … ┆ 2002 ┆ 2475000 ┆ 1527372 ┆ 15273.72 │\n", + "│ 91847537 ┆ 2018 ┆ 128574 ┆ 285721 ┆ … ┆ 1000 ┆ 1625000 ┆ 1496148 ┆ 14961.48 │\n", + "└────────────┴──────┴────────┴───────────────┴───┴──────────────┴─────────┴─────────┴──────────────┘" ] }, - "execution_count": 12, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "merged.filter([\n", - " pl.col(\"SaleAmt\").is_not_null(),\n", - " pl.col(\"Year\") > 2017\n", - "]).select(\n", - " ['PropertyID','Year','Value','AssessmentUsed','SaleAmt', 'TaxAmt', 'TaxAmtAdjusted']\n", - ").with_columns([\n", - " (pl.col('TaxAmt')/pl.col('Value')).alias(\"ApproxTaxRate\")\n", - "])" + "merged.head()" ] } ],