Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add analysis functions for M&E results #13

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions ipod/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from thor.orbit_determination.fitted_orbits import FittedOrbitMembers

from ipod.utils import ExpectedMembers, analyze_me_output


def test_me_analysis():

# Initialize the expected members
data_expected = {
"orbit_id": ["orbit1", "orbit1", "orbit2", "orbit2"],
"obs_id": ["obs1", "obs2", "obs3", "obs4"],
"primary_designation": [
"designation1",
"designation1",
"designation2",
"designation2",
],
}
Comment on lines +9 to +18
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How are orbits with overlapping observations handled?


expected_members = ExpectedMembers.from_kwargs(
orbit_id=data_expected["orbit_id"],
obs_id=data_expected["obs_id"],
primary_designation=data_expected["primary_designation"],
)

# Initialize the initial members
data_initial = {
"orbit_id": ["orbit1", "orbit2", "orbit2"],
"obs_id": ["obs1", "obs3", "obs4"],
}
initial_members = FittedOrbitMembers.from_kwargs(
orbit_id=data_initial["orbit_id"], obs_id=data_initial["obs_id"]
)

# Initialize the ME output members
data_me = {
"orbit_id": ["orbit1", "orbit1", "orbit3"],
"obs_id": ["obs1", "obs2", "obs5"],
}
me_members = FittedOrbitMembers.from_kwargs(
orbit_id=data_me["orbit_id"], obs_id=data_me["obs_id"]
)

# Call the function with the test data
result = analyze_me_output(expected_members, initial_members, me_members)

# Check the results using asserts
assert result.loc["designation1", "num_missing_obs"] == 0
assert result.loc["designation1", "num_extra_obs"] == 0
assert result.loc["designation1", "num_bogus_obs"] == 0
assert result.loc["designation1", "num_initial_orbits_with_attributed_members"] == 1
assert result.loc["designation1", "num_result_orbits_with_attributed_members"] == 1
assert result.loc["designation1", "best_result_orbit_id"] == "orbit1"
assert result.loc["designation1", "initial_orbits_with_attributed_members"] == [
"orbit1"
]
assert result.loc["designation1", "result_orbits_with_attributed_members"] == [
"orbit1",
]

assert result.loc["designation2", "num_missing_obs"] == 2
assert result.loc["designation2", "num_extra_obs"] == 0
assert result.loc["designation2", "num_bogus_obs"] == 0
assert result.loc["designation2", "num_initial_orbits_with_attributed_members"] == 1
assert result.loc["designation2", "num_result_orbits_with_attributed_members"] == 0
assert result.loc["designation2", "best_result_orbit_id"] == "None"
assert result.loc["designation2", "initial_orbits_with_attributed_members"] == [
"orbit2"
]
assert result.loc["designation2", "result_orbits_with_attributed_members"] == []
139 changes: 139 additions & 0 deletions ipod/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Tuple

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import quivr as qv
Expand Down Expand Up @@ -356,3 +357,141 @@ def assign_duplicate_observations(
filtered_orbit_members = qv.defragment(filtered_orbit_members)

return filtered, filtered_orbit_members


class ExpectedMembers(qv.Table):
orbit_id = qv.LargeStringColumn()
obs_id = qv.LargeStringColumn()
primary_designation = qv.LargeStringColumn(nullable=True)


def analyze_me_output(
members_expected: ExpectedMembers,
members_initial: FittedOrbitMembers,
members_me: FittedOrbitMembers,
) -> pd.DataFrame:
"""
Analyze the output of the merge and extend process. This function will return
a dataframe with a breakdown of the number of missing, extra, and bogus observations
as well as merges for each primary designation in the expected_members table.
"""

expected_members_df = members_expected.to_dataframe()

# Merge the expected members with the initial members and the ME members to
# get expected attributed designations
attrib_merge = members_me.to_dataframe().merge(expected_members_df, on="obs_id")
initial_attrib_merge = members_initial.to_dataframe().merge(
expected_members_df, on="obs_id"
)
attrib_merge_grouped = attrib_merge.groupby("primary_designation")
initial_attrib_merge_grouped = initial_attrib_merge.groupby("primary_designation")

# keep track of all obs_ids that are acceptable for any designation
all_acceptable_obs_ids = expected_members_df[
~expected_members_df["primary_designation"].isna()
]["obs_id"].unique()

analysis_dict: dict = {}

# iterate through each designation and compare the expected members to the attributed members
for designation, members in expected_members_df.groupby("primary_designation"):
all_attribs_from_run = None

# count the number of initial_orbits with observations attributed to this designation
initial_orbits_with_attributed_members = initial_attrib_merge_grouped.get_group(
designation
)["orbit_id_x"].unique()
num_initial_orbits_with_designation = len(
initial_orbits_with_attributed_members
)

# get all members from the ME run that are attributed to this designation
try:
all_attribs_from_run = attrib_merge_grouped.get_group(designation)
except KeyError:
print(f"Designation {designation} not found in resulting members")
analysis_dict[designation] = {
"num_missing_obs": len(members),
"num_extra_obs": 0,
"num_bogus_obs": 0,
"num_initial_orbits_with_attributed_members": num_initial_orbits_with_designation,
"num_result_orbits_with_attributed_members": 0,
"best_result_orbit_id": "None",
"initial_orbits_with_attributed_members": initial_orbits_with_attributed_members,
"result_orbits_with_attributed_members": [],
}
continue

# loop through each expected orbit_id that is attributed to this designation
for orbit_id, orbit_members in members.groupby("orbit_id"):
all_attributed_orbits_from_run = all_attribs_from_run["orbit_id_x"].unique()
num_missing_obs = 0
num_extra_obs = 0
num_bogus_obs = 0

# get the obs_ids that are acceptable for other designations
# we use this to determine if an observation is bogus, rather
# than mis-attributed
acceptable_obs_ids_for_other_designations = set(
all_acceptable_obs_ids
) - set(members["obs_id"])

# we could have multiple orbits attributed to this designation
best_orbit_id = None
if len(all_attributed_orbits_from_run) > 1:

# keep track of the orbit with the fewest missing observations
best_missing_obs_count = None
for me_orb_id in all_attributed_orbits_from_run:
members_for_orbit = all_attribs_from_run[
all_attribs_from_run["orbit_id_x"] == me_orb_id
]
expected_obs_ids = orbit_members["obs_id"]
result_obs_ids = members_for_orbit["obs_id"]

# if this orbit has fewer missing observations, save it
if (
best_missing_obs_count is None
or len(set(expected_obs_ids) - set(result_obs_ids))
< best_missing_obs_count
):
num_missing_obs = len(
set(expected_obs_ids) - set(result_obs_ids)
)
best_missing_obs_count = num_missing_obs
num_extra_obs = len(set(result_obs_ids) - set(expected_obs_ids))
best_orbit_id = me_orb_id

# if we have only one orbit attributed to this designation
else:
expected_obs_ids = orbit_members["obs_id"]
result_obs_ids = all_attribs_from_run["obs_id"]
num_missing_obs = len(set(expected_obs_ids) - set(result_obs_ids))
num_extra_obs = len(set(result_obs_ids) - set(expected_obs_ids))
num_bogus_obs = len(
(set(result_obs_ids) - set(expected_obs_ids))
- acceptable_obs_ids_for_other_designations
)
best_orbit_id = all_attributed_orbits_from_run[0]

# if this is a new record or has less missing observations, save it
# essentially, store results for best-fit attribution
if (
designation not in analysis_dict.keys()
or num_missing_obs < analysis_dict[designation]["num_missing_obs"]
):
analysis_dict[designation] = {
"num_missing_obs": num_missing_obs,
"num_extra_obs": num_extra_obs,
"num_bogus_obs": num_bogus_obs,
"num_initial_orbits_with_attributed_members": num_initial_orbits_with_designation,
"num_result_orbits_with_attributed_members": len(
all_attributed_orbits_from_run
),
"best_result_orbit_id": best_orbit_id,
"initial_orbits_with_attributed_members": initial_orbits_with_attributed_members,
"result_orbits_with_attributed_members": all_attributed_orbits_from_run,
}

return pd.DataFrame(analysis_dict).T
Loading