Improve dividend repair

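Dividend repair tweaks:
- handle a dividend that is missing the adjustment for a coincident stock split
- prevent ~100x price changes on the ex-dividend date from confusing detection of dividends that are too big or too small
- improve detection of dividends recorded on the wrong date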
ranaroussi · Oct 20, 2024 · 3152715 · 3152715
1 parent ee657b2
commit 3152715
Showing 1 changed file with 70 additions and 13 deletions.
diff --git a/yfinance/scrapers/history.py b/yfinance/scrapers/history.py
@@ -3,6 +3,7 @@
 import logging
 import numpy as np
 import pandas as pd
+from math import isclose
 import time as _time
 import bisect
 
@@ -1376,9 +1377,26 @@ def _fix_bad_div_adjust(self, df, interval, currency):
             # div_too_big_improvement_threshold = 1
             div_too_big_improvement_threshold = 2
 
-            drop_c2l = df2['Close'].iloc[div_idx-1] - df2['Low'].iloc[div_idx]
-            # drop_c2c = df2['Close'].iloc[div_idx-1] - df2['Close'].iloc[div_idx]
-            # drop = drop_c2c
+            if isclose(df2['Low'].iloc[div_idx], df2['Close'].iloc[div_idx-1]*100, rel_tol = 0.025):
+                # Price has jumped ~100x on ex-div day, need to fix immediately.
+                drop_c2l = df2['Close'].iloc[div_idx-1]*100 - df2['Low'].iloc[div_idx]
+                div_pct = div / (df2['Close'].iloc[div_idx-1]*100)
+                true_adjust = 1.0 - div / (df2['Close'].iloc[div_idx-1]*100)
+                present_adj = df2['Adj Close'].iloc[div_idx-1] / df2['Close'].iloc[div_idx-1]
+                if not isclose(present_adj, true_adjust, rel_tol = 0.025):
+                    df2.loc[:dt-_datetime.timedelta(seconds=1), 'Adj Close'] = true_adjust * df2['Close'].loc[:dt-_datetime.timedelta(seconds=1)]
+                    df2.loc[:dt-_datetime.timedelta(seconds=1), 'Repaired?'] = True
+            elif isclose(df2['Low'].iloc[div_idx], df2['Close'].iloc[div_idx-1]*0.01, rel_tol = 0.025):
+                # Price has dropped ~100x on ex-div day, need to fix immediately.
+                drop_c2l = df2['Close'].iloc[div_idx-1]*0.01 - df2['Low'].iloc[div_idx]
+                div_pct = div / (df2['Close'].iloc[div_idx-1]*0.01)
+                true_adjust = 1.0 - div / (df2['Close'].iloc[div_idx-1]*100)
+                present_adj = df2['Adj Close'].iloc[div_idx-1] / df2['Close'].iloc[div_idx-1]
+                if not isclose(present_adj, true_adjust, rel_tol = 0.025):
+                    df2.loc[:dt-_datetime.timedelta(seconds=1), 'Adj Close'] = true_adjust * df2['Close'].loc[:dt-_datetime.timedelta(seconds=1)]
+                    df2.loc[:dt-_datetime.timedelta(seconds=1), 'Repaired?'] = True
+            else:
+                drop_c2l = df2['Close'].iloc[div_idx-1] - df2['Low'].iloc[div_idx]
             drop = drop_c2l
             if div_idx < len(df2)-1:
                 # # In low-volume scenarios, the price drop is day after not today.
@@ -1388,8 +1406,10 @@ def _fix_bad_div_adjust(self, df, interval, currency):
                 # elif df2['Volume'].iloc[div_idx]==0:
                 #     if drop == 0.0:
                 #         drop = np.max(df2['Close'].iloc[div_idx-1:div_idx+1].to_numpy() - df2['Low'].iloc[div_idx:div_idx+2].to_numpy())
+                #
                 # Hmm, can I always look ahead 1 day? Catch: increases FP rate of div-too-small for tiny divs.
-                drops = df2['Close'].iloc[div_idx-1:div_idx+1].to_numpy() - df2['Low'].iloc[div_idx:div_idx+2].to_numpy()
+                # drops = df2['Close'].iloc[div_idx-1:div_idx+1].to_numpy() - df2['Low'].iloc[div_idx:div_idx+2].to_numpy()
+                drops = np.array([drop, df2['Close'].iloc[div_idx] - df2['Low'].iloc[div_idx+1]])
                 drop_2Dmax = np.max(drops)
             else:
                 drops = np.array([drop])
@@ -1728,21 +1748,27 @@ def cluster_dividends(df, column='div', threshold=7):
                 adjDeltas = x['Adj Low'].iloc[1:].to_numpy() - x['Adj Close'].iloc[:-1].to_numpy()
                 adjDeltas = np.append([0.0], adjDeltas)
                 x['adjDelta'] = adjDeltas
+                for i in np.where(x['Dividends']>0)[0]:
+                    x.loc[x.index[i], 'adjDelta'] += x['Dividends'].iloc[i]*x['Adj'].iloc[i]
                 deltas = x[['delta', 'adjDelta']]
-                if div_pct > 0.15 and div_pct < 1.0:  # avoid analysing impossibly-big dividends here
+                if div_pct > 0.05 and div_pct < 1.0:
                     adjDiv = div * x['Adj'].iloc[0]
                     f = deltas['adjDelta'] > (adjDiv*0.6)
                     if f.any():
-                        for idx in np.where(f)[0]:
-                            adjDelta_max_drop_idx = idx
-                            adjDelta_max_drop = deltas['adjDelta'].iloc[idx]
-                            if adjDelta_max_drop > 1.001*deltas['delta'].iloc[adjDelta_max_drop_idx]:
+                        indices = np.where(f)[0]
+                        for idx in indices:
+                            adjDelta_drop = deltas['adjDelta'].iloc[idx]
+                            if adjDelta_drop > 1.001*deltas['delta'].iloc[idx]:
                                 # Adjusted price has risen by more than unadjusted, should not happen.
                                 # See if Adjusted price later falls by a similar amount. This would mean
                                 # dividend has been applied too early.
-                                ratios = (-1*deltas['adjDelta'])/adjDelta_max_drop
+                                ratios = (-1*deltas['adjDelta'])/adjDelta_drop
                                 f_near1_or_above = ratios>=0.8
-                                if f_near1_or_above.any():
+                                # Update: only check for wrong date if no coincident split.
+                                # Because if a split, more likely the div is missing split
+                                split = df2['Stock Splits'].loc[dt]
+                                pre_split = div_status_df['div_pre_split'].loc[dt]
+                                if (split==0.0 or (not pre_split)) and f_near1_or_above.any():
                                     near_indices = np.where(f_near1_or_above)[0]
                                     if len(near_indices) > 1:
                                         penalties = np.zeros(len(near_indices))
@@ -1760,7 +1786,7 @@ def cluster_dividends(df, column='div', threshold=7):
                                     div_date_wrong = True
                                     div_true_date = ratios.index[reversal_idx]
                                     break
-                                elif adjDelta_max_drop > 0.39*adjDiv:
+                                elif adjDelta_drop > 0.39*adjDiv:
                                     # Still true that applied adjustment exceeds price action, 
                                     # just not clear what solution is (if any).
                                     div_adj_exceeds_prices = True
@@ -1777,6 +1803,27 @@ def cluster_dividends(df, column='div', threshold=7):
             div_status['div_date_wrong'] = div_date_wrong
             div_status['div_true_date'] = div_true_date
 
+            if div_adj_exceeds_prices:
+                split = df2['Stock Splits'].loc[dt]
+                if split != 0.0:
+                    # Check again if div missing split. Use looser tolerance
+                    # as we know the adjustment seems wrong.
+                    div_postSplit = div / split
+                    if div_postSplit > div:
+                        # Use volatility-adjusted drop
+                        typical_volatility = div_status_df['vol'].loc[dt]
+                        drop = div_status_df['drop'].loc[dt]
+                        _drop = drop - typical_volatility
+                    else:
+                        drop_2Dmax = div_status_df['drop_2Dmax'].loc[dt]
+                        _drop = drop_2Dmax
+                    if _drop > 0:
+                        diff = abs(div-_drop)
+                        diff_postSplit = abs(div_postSplit-_drop)
+                        if diff_postSplit <= (diff*1.1):
+                            # possibilities.append({'state':'div-pre-split', 'diff':diff_postSplit})
+                            div_status_df.loc[dt, 'div_pre_split'] = True
+
             for k,v in div_status.items():
                 if k not in div_status_df:
                     if isinstance(v, (bool, np.bool_)):
@@ -1852,7 +1899,7 @@ def cluster_dividends(df, column='div', threshold=7):
                     if 'div_date_wrong' in cluster.columns and (cluster[c] == cluster['div_date_wrong']).all():
                         continue
 
-                    if 'adj_exceeds_prices' in cluster.columns and (cluster[c] == cluster['adj_exceeds_prices']).all():
+                    if 'adj_exceeds_prices' in cluster.columns and (cluster[c] == (cluster[c] & cluster['adj_exceeds_prices'])).all():
                         # More likely that true-positive. Maybe the div never happened
                         continue
 
@@ -1876,6 +1923,11 @@ def cluster_dividends(df, column='div', threshold=7):
 
                     if pct_fail >= true_threshold:
                         div_status_df.loc[fc, c] = True
+                        if 'div_date_wrong' in div_status_df.columns:
+                            # reset this as well
+                            div_status_df.loc[fc, 'div_date_wrong'] = False
+                            div_status_df.loc[fc, 'div_true_date'] = pd.NaT
+                            cluster = div_status_df[fc].sort_index()
                         continue
                     elif pct_fail <= fals_threshold:
                         div_status_df.loc[fc, c] = False
@@ -1961,6 +2013,11 @@ def cluster_dividends(df, column='div', threshold=7):
                         div_too_big = False
                         cluster.loc[dt, 'div_too_big'] = False
                         n_failed_checks -= 1
+                    if div_exceeds_adj:
+                        # false-positive
+                        div_exceeds_adj = False
+                        cluster.loc[dt, 'div_exceeds_adj'] = False
+                        n_failed_checks -= 1
 
                 if div_pre_split:
                     if adj_exceeds_prices: