Clean code: docstrings and comments #197

Open
wants to merge 11 commits into main
2 changes: 1 addition & 1 deletion sourcecode/scoring/constants.py
@@ -20,7 +20,7 @@
# https://docs.python.org/3/tutorial/modules.html#more-on-modules
epochMillis = 1000 * time.time()
useCurrentTimeInsteadOfEpochMillisForNoteStatusHistory = True
-# Use this size threshld to isolate code which should be run differently in small
+# Use this size threshold to isolate code which should be run differently in small
# scale unit tests.
minNumNotesForProdData = 200
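For context, a minimal sketch of how a size threshold like this can gate logic that should only run on production-scale data; the helper name is hypothetical and only the constant's value comes from the hunk above.

```python
minNumNotesForProdData = 200  # value shown in the hunk above


def is_prod_scale(notes) -> bool:
  """True when the input is large enough to use the production code path."""
  # Small fixtures used in unit tests stay below this threshold and can take a
  # test-friendly branch instead.
  return len(notes) >= minNumNotesForProdData
```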

6 changes: 3 additions & 3 deletions sourcecode/scoring/contributor_state.py
@@ -8,7 +8,7 @@
def should_earn_in(contributorScoresWithEnrollment: pd.DataFrame):
"""
The participant should earn in when they are in the earnedOutAcknowledged and newUser state.
-To earn in, we need to check that the rating impact is larger than the succesfully ratings
+To earn in, we need to check that the rating impact is larger than the successfully ratings
needed to earn in. This constant is fixed for new users (ratingImpactForEarnIn), for
earnedOutNoAcknowledge it will be set int the CombineEventAndSnapshot job to +5 their current
rating impact with a minimum of ratingImpactForEarnIn.
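As a rough illustration of the earn-in condition this docstring describes, the sketch below compares rating impact against a per-user target; the column names and the fallback value are assumptions, not the actual schema.

```python
import pandas as pd

ratingImpactForEarnIn = 5  # illustrative value only


def earn_in_mask(contributorScoresWithEnrollment: pd.DataFrame) -> pd.Series:
  """True for contributors whose rating impact exceeds their earn-in target."""
  # New users fall back to the fixed constant; earned-out users carry a
  # per-user target (current rating impact + 5, floored at the constant).
  target = contributorScoresWithEnrollment["successfulRatingsNeededToEarnIn"].fillna(
    ratingImpactForEarnIn
  )
  return contributorScoresWithEnrollment["ratingImpact"] > target
```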
@@ -118,7 +118,7 @@ def _get_visible_rating_counts(
) -> pd.DataFrame:
"""
Given scored notes from the algorithm, all ratings, and note status history, this function
-analyzes how succesfully a user rates notes. It aggregates how successfully/unsucessfully
+analyzes how successfully a user rates notes. It aggregates how successfully/unsuccessfully
a notes ratings aligns with a contributors ratings.

Args:
@@ -452,7 +452,7 @@ def get_contributor_scores(
) -> pd.DataFrame:
"""
Given the outputs of the MF model, this function aggregates stats over notes and ratings. The
-contributor scores are merged and attached to helfpulness scores in the algorithm.
+contributor scores are merged and attached to helpfulness scores in the algorithm.

Args:
scoredNotes (pd.DataFrame): scored notes
2 changes: 1 addition & 1 deletion sourcecode/scoring/mf_core_scorer.py
@@ -82,7 +82,7 @@ def filter_core_input(
# and (2) less than half of the ratings are from CORE users. Any other note is considered
# a CORE note. This construction means that we only count a note as EXPANSION when there
# is reason to believe that the EXPANSION model could assign the note status. In all other
-# case we leave the note as CORE so that the note will be eligble for locking. In effect,
+# case we leave the note as CORE so that the note will be eligible for locking. In effect,
# this approach biases us towards locking note status at 2 weeks and only avoiding locking
# when a note is scored by the EXPANSION model.
print(f" Total notes: {len(noteStatusHistory)}")
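Condition (2) above (less than half of the ratings come from CORE users) could be checked roughly as in the sketch below; condition (1) lies outside this excerpt and is left as a placeholder, and the column names are assumptions.

```python
import pandas as pd


def expansion_note_mask(ratings: pd.DataFrame, condition1: pd.Series) -> pd.Series:
  """True for notes that should be handed to the EXPANSION model."""
  # Fraction of each note's ratings that come from CORE raters.
  coreFraction = ratings.groupby("noteId")["raterIsCore"].mean()
  # A note is EXPANSION only when both conditions hold; any other note stays
  # CORE so that it remains eligible for status locking.
  return condition1 & (coreFraction < 0.5)
```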
2 changes: 1 addition & 1 deletion sourcecode/scoring/mf_group_scorer.py
@@ -28,7 +28,7 @@ def _coalesce_columns(df: pd.DataFrame, columnPrefix: str) -> pd.DataFrame:

Args:
df: DataFrame containing columns to condense
-collumnPrefix: Prefix used to detect columns to coalesce, and the name for
+columnPrefix: Prefix used to detect columns to coalesce, and the name for
the output column.

Returns:
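A minimal sketch of the prefix-based coalescing this docstring describes, assuming pandas; it is not the function's actual implementation.

```python
import pandas as pd


def coalesce(df: pd.DataFrame, columnPrefix: str) -> pd.DataFrame:
  """Condense all columns sharing a prefix into one column named after the prefix."""
  cols = [col for col in df.columns if col.startswith(columnPrefix)]
  # Take the first non-null value across the matching columns, left to right.
  coalesced = df[cols].bfill(axis=1).iloc[:, 0]
  df = df.drop(columns=cols)
  df[columnPrefix] = coalesced
  return df
```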
4 changes: 2 additions & 2 deletions sourcecode/scoring/note_ratings.py
@@ -159,7 +159,7 @@ def get_ratings_with_scores(
doTypeCheck: bool = True,
) -> pd.DataFrame:
"""
-This funciton merges the note status history, ratings, and scores for later aggregation.
+This function merges the note status history, ratings, and scores for later aggregation.

Args:
ratings (pd.DataFrame): all ratings
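Schematically, the merge described here attaches per-note history and scores to each rating row, roughly as below; the join key and frame names are assumptions.

```python
import pandas as pd


def ratings_with_scores(
  ratings: pd.DataFrame, noteStatusHistory: pd.DataFrame, noteScores: pd.DataFrame
) -> pd.DataFrame:
  """Attach note status history and model scores to every rating row."""
  merged = ratings.merge(noteStatusHistory, on="noteId", how="left", suffixes=("", "_nsh"))
  return merged.merge(noteScores, on="noteId", how="left")
```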
@@ -368,7 +368,7 @@ def compute_scored_notes(
is_crnh_ucb_function: Callable[..., pd.Series] = is_crnh_ucb,
) -> pd.DataFrame:
"""
-Merges note status history, ratings, and model output. It annotes the data frame with
+Merges note status history, ratings, and model output. It annotates the data frame with
different note statuses, and features needed to calculate contributor stats.

Args:
2 changes: 1 addition & 1 deletion sourcecode/scoring/process_data.py
@@ -44,7 +44,7 @@ def tsv_parser(
"""Parse a TSV input and raise an Exception if the input is not formatted as expected.

Args:
-rawTSV: str contianing entire TSV input
+rawTSV: str containing entire TSV input
mapping: Dict mapping column names to types
columns: List of column names
header: bool indicating whether the input will have a header
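A sketch of the kind of validation and typed parsing described here, assuming pandas; the exact checks and error type are assumptions rather than the repository's implementation.

```python
from io import StringIO
from typing import Dict, List

import pandas as pd


def parse_tsv(rawTSV: str, mapping: Dict[str, type], columns: List[str], header: bool) -> pd.DataFrame:
  """Parse a TSV string into a typed DataFrame, failing fast on malformed input."""
  firstLine = rawTSV.split("\n", 1)[0]
  if len(firstLine.split("\t")) != len(columns):
    raise ValueError("unexpected number of columns in TSV input")
  return pd.read_csv(
    StringIO(rawTSV),
    sep="\t",
    names=None if header else columns,
    dtype=mapping,
  )
```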
@@ -30,7 +30,7 @@ def get_low_diligence_intercepts(
# Model hyperparameters
activationFunction="IDENTITY",
nDim=1,
-# Optimizaiton hyperparameters
+# Optimization hyperparameters
numEpochs=300,
logRate=30,
learningRate=0.2,
@@ -22,7 +22,7 @@ def get_helpfulness_reputation_results(
# Model hyperparameters
activationFunction="IDENTITY",
nDim=1,
-# Optimizaiton hyperparameters
+# Optimization hyperparameters
numEpochs=300,
logRate=30,
learningRate=0.2,
6 changes: 3 additions & 3 deletions sourcecode/scoring/run_scoring.py
@@ -2,7 +2,7 @@

This file defines "run_scoring" which invokes all Community Notes scoring algorithms,
merges results and computes contribution statistics for users. run_scoring should be
-intergrated into main files for execution in internal and external environments.
+integrated into main files for execution in internal and external environments.
"""

from collections import namedtuple
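Purely as a hypothetical integration sketch: a main file might load the TSV inputs and hand them to run_scoring along the lines below. The import path, input names, argument order, and return values are assumptions, not the actual signature.

```python
import pandas as pd

from scoring.run_scoring import run_scoring  # assumed import path

ratings = pd.read_csv("ratings.tsv", sep="\t")
noteStatusHistory = pd.read_csv("noteStatusHistory.tsv", sep="\t")
userEnrollment = pd.read_csv("userEnrollment.tsv", sep="\t")

# Assumed return shape: scored notes, per-user helpfulness scores, updated
# note status history, and auxiliary note info.
scoredNotes, helpfulnessScores, newNoteStatusHistory, auxiliaryNoteInfo = run_scoring(
  ratings, noteStatusHistory, userEnrollment
)
```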
@@ -385,7 +385,7 @@ def _compute_note_stats(
scorers have run guarantees completeness over all Community Notes data.

Args:
-ratings: pd.DataFrame continaing *all* ratings on *all* notes from *all* users.
+ratings: pd.DataFrame containing *all* ratings on *all* notes from *all* users.
noteStatusHistory: pd.DataFrame containing complete noteStatusHistory for all notes.

Returns:
@@ -441,7 +441,7 @@ def _compute_helpfulness_scores(
helpfulnessScores pd.DataFrame: one row per user containing a column for each helpfulness score.
"""
with c.time_block("Meta Helpfulness Scorers: Setup"):
-# Generate a uunified view of note scoring information for computing contributor stats
+# Generate a unified view of note scoring information for computing contributor stats
assert len(scoredNotes) == len(auxiliaryNoteInfo), "notes in both note inputs must match"
scoredNotesWithStats = scoredNotes.merge(
# noteId and timestamp are the only common fields, and should always be equal.
16 changes: 8 additions & 8 deletions sourcecode/scoring/scoring_rules.py
@@ -62,7 +62,7 @@ class ScoringRule(ABC):
"""Scoring logic describing how to assign a ratingStatus given raw scoring signals and note attributes.

Each ScoringRule must have a name and version. Each ScoringRule must implement a score_notes function,
-which accepts as input the raw attributes of notes and currently assigned lables and returns (1) a
+which accepts as input the raw attributes of notes and currently assigned labels and returns (1) a
DataFrame specifying the noteIDs and associated status which the rule will assign, and (2) a DF
containing any new columns which should be added to the output for those noteIDs.
"""
@@ -96,7 +96,7 @@ def score_notes(
"""Identify which notes the ScoringRule should be active for, and any new columns to add for those notes.

Args:
-noteStats: Raw note attributes, scoring signals and attirbutes for notes.
+noteStats: Raw note attributes, scoring signals and attributes for notes.
currentLabels: the ratingStatus assigned to each note from prior ScoringRules.
statusColumn: str indicating column where status should be assigned.

@@ -158,7 +158,7 @@ def score_notes(
"""Returns noteIDs for notes matched by the boolean function."""
mask = self._function(noteStats)
if self._onlyApplyToNotesThatSayTweetIsMisleading:
-# Check for inequality with "not misleading" to include notes whose classificaiton
+# Check for inequality with "not misleading" to include notes whose classification
# is nan (i.e. deleted notes).
mask = mask & (noteStats[c.classificationKey] != c.noteSaysTweetIsNotMisleadingKey)

@@ -255,15 +255,15 @@ def score_notes(
print(f"outlier filtering disabled for tag: {tag}")
continue
tagFilteredNotes = crhStats[
-# Adjusted total must pass minimum threhsold set across all tags.
+# Adjusted total must pass minimum threshold set across all tags.
(crhStats[adjustedColumn] > self._minAdjustedTotal)
# Adjusted ratio must exceed percentile based total for this specific tag.
& (crhStats[adjustedRatioColumn] > thresholds[adjustedRatioColumn])
][c.noteIdKey]
impactedNotes = pd.concat(
[impactedNotes, pd.DataFrame({c.noteIdKey: tagFilteredNotes, c.activeFilterTagsKey: tag})]
)
-# log and consolidate imapcted notes
+# log and consolidate impacted notes
print(f"Total {{note, tag}} pairs where tag filter logic triggered: {len(impactedNotes)}")
impactedNotes = impactedNotes.groupby(c.noteIdKey).aggregate(list).reset_index()
impactedNotes[c.activeFilterTagsKey] = [
@@ -293,7 +293,7 @@ def __init__(
dependencies: Rules which must run before this rule can run.
status: the status which each note should be set to (e.g. CRH, CRNH, NMR)
tagThreshold: threshold for number of included raters to issue a tag
-voteThreshold: threshold for number of included raters (raters must have issued a NH tag to be inclueed)
+voteThreshold: threshold for number of included raters (raters must have issued a NH tag to be included)
weightedTotalVotes: For the filter to trigger, the sum of weighted incorrect votes must
exceed the minAdjustedTotal.
superThreshold: if set, allow notes with an intercept above threshold to bypass the filter.
@@ -668,7 +668,7 @@ def score_notes(
& (noteStats[c.internalNoteInterceptKey] >= self._threshold)
# Note must have been rated CRH during the last scoring run.
& (noteStats[c.currentLabelKey] == c.currentlyRatedHelpful)
-# Check for inequality with "not misleading" to include notes whose classificaiton
+# Check for inequality with "not misleading" to include notes whose classification
# is nan (i.e. deleted notes).
& (noteStats[c.classificationKey] != c.noteSaysTweetIsNotMisleadingKey)
][[c.noteIdKey]],
@@ -804,5 +804,5 @@ def apply_scoring_rules(
)
scoredNotes[c.awaitingMoreRatingsBoolKey] = scoredNotes[statusColumn] == c.needsMoreRatings

-# Return completed DF including original noteStats signals merged wtih scoring results
+# Return completed DF including original noteStats signals merged with scoring results
return scoredNotes
4 changes: 2 additions & 2 deletions sourcecode/scoring/tag_filter.py
@@ -1,4 +1,4 @@
-"""Utilites for tag based scoring logic."""
+"""Utilities for tag based scoring logic."""

from typing import Dict

@@ -125,7 +125,7 @@ def get_tag_thresholds(ratings: pd.DataFrame, percentile: int) -> Dict[str, floa

Args:
ratings: DataFrame containing adjusted ratio columns
-percnetile: int in the range [0, 100)
+percentile: int in the range [0, 100)

Returns:
Dictionary mapping adjusted ratio columns to a threshold value
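Schematically, a percentile-based threshold per adjusted-ratio column could be computed as below; the column suffix is a made-up convention rather than the repository's.

```python
from typing import Dict

import numpy as np
import pandas as pd


def tag_thresholds(ratings: pd.DataFrame, percentile: int) -> Dict[str, float]:
  """Map each adjusted-ratio column to its percentile-based threshold."""
  thresholds = {}
  for column in ratings.columns:
    if column.endswith("AdjustedRatio"):  # hypothetical suffix for adjusted ratio columns
      thresholds[column] = float(np.percentile(ratings[column], percentile))
  return thresholds
```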