-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: Update growth stage and growth stage start date in FarmRegistry #407
base: main
Are you sure you want to change the base?
Changes from 1 commit
ee4e815
79ef3fc
8599cdb
54499a1
b98838a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,14 +10,20 @@ | |
import time | ||
import numpy as np | ||
import pandas as pd | ||
from django.db import connection | ||
from django.db import connection, transaction | ||
from django.db.models import Min | ||
import dask.dataframe as dd | ||
from dask.dataframe.core import DataFrame as dask_df | ||
from django.contrib.gis.db.models import Union | ||
from sqlalchemy import create_engine | ||
|
||
from gap.models import FarmRegistryGroup, FarmRegistry, Grid, CropGrowthStage | ||
from gap.models import ( | ||
FarmRegistryGroup, | ||
FarmRegistry, | ||
Grid, | ||
CropGrowthStage, | ||
Farm | ||
) | ||
from dcas.models import DCASConfig, DCASConfigCountry | ||
from dcas.partitions import ( | ||
process_partition_total_gdd, | ||
|
@@ -429,6 +435,71 @@ | |
|
||
self.data_output.save(OutputType.FARM_CROP_DATA, farm_df) | ||
|
||
def update_farm_registry_growth_stage(self): | ||
"""Efficiently update growth stage in FarmRegistry.""" | ||
# Load Data | ||
grid_crop_df = self.data_query.read_grid_data_crop_meta_parquet( | ||
self.data_output.grid_crop_data_dir_path | ||
) | ||
|
||
# Ensure required columns exist | ||
required_columns = { | ||
"grid_id", | ||
"growth_stage_id", | ||
"growth_stage_start_date" | ||
} | ||
missing_columns = required_columns - set(grid_crop_df.columns) | ||
if missing_columns: | ||
raise ValueError( | ||
f"Missing columns in grid_crop_df: {missing_columns}" | ||
) | ||
|
||
# Convert `growth_stage_start_date` to datetime safely | ||
grid_crop_df["growth_stage_start_date"] = pd.to_datetime( | ||
grid_crop_df["growth_stage_start_date"], errors="coerce" | ||
).dt.date # Convert to date format | ||
|
||
# Get mapping of grid_id to farm_id | ||
farm_mapping = { | ||
row["grid_id"]: row["id"] for row in Farm.objects.values( | ||
"id", | ||
"grid_id" | ||
) | ||
} | ||
|
||
# Find Existing FarmRegistry Records | ||
farm_ids = list(farm_mapping.values()) # Get farm IDs | ||
existing_farm_registry = { | ||
fr.farm_id: fr for fr in FarmRegistry.objects.filter( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you can use 'registry_id' column from the parquet to filter by 'id__in'. |
||
farm_id__in=farm_ids | ||
) | ||
} | ||
|
||
updates = [] | ||
for row in grid_crop_df.itertuples(index=False): | ||
farm_id = farm_mapping.get(row.grid_id) | ||
if farm_id: | ||
# Ensure we're updating an existing record | ||
farm_registry_instance = existing_farm_registry.get(farm_id) | ||
if farm_registry_instance: | ||
farm_registry_instance.crop_growth_stage_id = ( | ||
row.growth_stage_id | ||
) | ||
farm_registry_instance.growth_stage_start_date = ( | ||
row.growth_stage_start_date | ||
) | ||
updates.append(farm_registry_instance) | ||
|
||
# Bulk Update | ||
if updates: | ||
with transaction.atomic(): | ||
FarmRegistry.objects.bulk_update( | ||
updates, [ | ||
"crop_growth_stage_id", | ||
"growth_stage_start_date" | ||
] | ||
) | ||
|
||
def _append_grid_crop_meta(self, farm_df_meta: pd.DataFrame): | ||
# load from grid_crop data | ||
grid_crop_df_meta = self.data_query.read_grid_data_crop_meta_parquet( | ||
|
@@ -470,6 +541,7 @@ | |
self.process_grid_crop_data() | ||
|
||
self.process_farm_registry_data() | ||
self.update_farm_registry_growth_stage() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please call this method to celery task with input of request_id from DCASRequest |
||
csv_file = self.extract_csv_output() | ||
|
||
self.send_csv_to_sftp(csv_file) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@osundwajeff this should be query to the farm parquet files like you did on error handling. It needs to accept year, month, and day from the DCASRequest.requested_at.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we also need to use chunks to avoid loading all data in the memory.