From 0a7fc79212cf67dcfb409987a708c92ab18d6eb0 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Mon, 19 Sep 2022 23:35:34 -0500 Subject: [PATCH 1/7] added info about neutral/conf games --- src/cbbpy/mens_scraper.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index ff42d34..ed28466 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -425,6 +425,23 @@ def get_game_info(game_id: str) -> pd.DataFrame: game_r2 = np.nan game_r3 = np.nan + conf_home = 'conf' in home_div.get_text().lower() + conf_away = 'conf' in away_div.get_text().lower() + home_home = 'home' in home_div.get_text().lower() + away_away = 'away' in away_div.get_text().lower() + + if conf_home and conf_away: + is_conf = True + else: + is_conf = False + + if home_home or away_away: + is_neutral = False + elif is_conf and not type(game_meta) == str: + is_neutral = False + else: + is_neutral = True + # AGGREGATE DATA INTO DATAFRAME AND RETURN game_info_list = [ game_id, @@ -440,6 +457,8 @@ def get_game_info(game_id: str) -> pd.DataFrame: away_score, home_win, num_ots, + is_conf, + is_neutral, game_meta, game_day, game_time, @@ -467,6 +486,8 @@ def get_game_info(game_id: str) -> pd.DataFrame: "away_score", "home_win", "num_ots", + "is_conference", + "is_neutral", "tournament", "game_day", "game_time", @@ -532,7 +553,7 @@ def get_games_season(season: int) -> tuple: all_data.append(games_info_day) else: - t.set_description(f"No games on {date}") + t.set_description(f"No games on {date.strftime('%D')}") date += timedelta(days=1) From 3862d71d7f8d0c8edc2bcebb67636de60187326b Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Tue, 20 Sep 2022 21:00:25 -0500 Subject: [PATCH 2/7] edge cases - boxscore tot --- src/cbbpy/mens_scraper.py | 93 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 3 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index ed28466..b2d98e6 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -188,7 +188,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: # no exception thrown break - return pd.concat([df_home, df_away]) + return pd.concat([df_home, df_away]).reset_index(drop=True) def get_game_pbp(game_id: str) -> pd.DataFrame: @@ -250,7 +250,7 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: # no exception thrown break - return pd.concat(pbp_halves) + return pd.concat(pbp_halves).reset_index(drop=True) def get_game_info(game_id: str) -> pd.DataFrame: @@ -667,6 +667,32 @@ def _clean_boxscore_table(table, team, game_id): df.pf = pd.to_numeric(df.pf, errors='coerce') df.pts = pd.to_numeric(df.pts, errors='coerce') + # TOTALS ROW + tot_row = [row for row in all_rows if 'TEAM' in row.get_text()] + tot_t = "" + tot_t += str(tot_row[0]) + tot_t += "
" + + df_tot = pd.read_html(tot_t)[0] + + df_tot.columns = df.columns + # type handling + df_tot.starters = df_tot.starters.astype(str) + df_tot['min'] = pd.to_numeric(df_tot['min'], errors='coerce') + df_tot.fg = df_tot.fg.astype(str) + df_tot['3pt'] = df_tot['3pt'].astype(str) + df_tot.ft = df_tot.ft.astype(str) + df_tot.oreb = pd.to_numeric(df_tot.oreb, errors='coerce') + df_tot.dreb = pd.to_numeric(df_tot.dreb, errors='coerce') + df_tot.reb = pd.to_numeric(df_tot.reb, errors='coerce') + df_tot.ast = pd.to_numeric(df_tot.ast, errors='coerce') + df_tot.stl = pd.to_numeric(df_tot.stl, errors='coerce') + df_tot.blk = pd.to_numeric(df_tot.blk, errors='coerce') + df_tot.to = pd.to_numeric(df_tot.to, errors='coerce') + df_tot.pf = pd.to_numeric(df_tot.pf, errors='coerce') + df_tot.pts = pd.to_numeric(df_tot.pts, errors='coerce') + + # START BY CLEANING PLAYER BOXSCORES # GET PLAYER IDS ids = [x.find("a")["href"].split("/")[-2] for x in good_rows if x.find("a")] @@ -718,7 +744,68 @@ def _clean_boxscore_table(table, team, game_id): df.insert(14, "fta", fta) df['fta'] = pd.to_numeric(df['fta'], errors='coerce') - return df + # THEN CLEAN TOTAL ROW + # SPLIT UP THE FG FIELDS + fgm = [x.split('-')[0] + if x.split('-')[0] != '' + else np.nan + for x in df_tot['fg']] + fga = [x.split('-')[1] + if x.split('-')[1] != '' + else np.nan + for x in df_tot['fg']] + thpm = [x.split('-')[0] + if x.split('-')[0] != '' + else np.nan + for x in df_tot['3pt']] + thpa = [x.split('-')[1] + if x.split('-')[1] != '' + else np.nan + for x in df_tot['3pt']] + ftm = [x.split('-')[0] + if x.split('-')[0] != '' + else np.nan + for x in df_tot['ft']] + fta = [x.split('-')[1] + if x.split('-')[1] != '' + else np.nan + for x in df_tot['ft']] + + # GET RID OF UNWANTED COLUMNS + df_tot = df_tot.drop(columns=["fg", "3pt", "ft"]) + + df_tot = df_tot.rename(columns={"starters": "player"}) + df_tot['player'] = 'TEAM' + + # INSERT COLUMNS WHERE NECESSARY + df_tot.insert(0, "game_id", game_id) + df_tot.game_id = df_tot.game_id.astype(str) + df_tot.insert(1, "team", team) + df_tot.team = df_tot.team.astype(str) + df_tot.insert(3, "player_id", 'TOTAL') + df_tot.player_id = df_tot.player_id.astype(str) + df_tot.insert(4, "position", 'TOTAL') + df_tot.position = df_tot.position.astype(str) + df_tot.insert(5, "starter", 0) + df_tot.starter = df_tot.starter.astype(bool) + df_tot.insert(7, "fgm", fgm) + df_tot.fgm = pd.to_numeric(df_tot.fgm, errors='coerce') + df_tot.insert(8, "fga", fga) + df_tot.fga = pd.to_numeric(df_tot.fga, errors='coerce') + df_tot.insert(9, "2pm", [float(x) - float(y) for x, y in zip(fgm, thpm)]) + df_tot['2pm'] = pd.to_numeric(df_tot['2pm'], errors='coerce') + df_tot.insert(10, "2pa", [float(x) - float(y) for x, y in zip(fga, thpa)]) + df_tot['2pa'] = pd.to_numeric(df_tot['2pa'], errors='coerce') + df_tot.insert(11, "3pm", thpm) + df_tot['3pm'] = pd.to_numeric(df_tot['3pm'], errors='coerce') + df_tot.insert(12, "3pa", thpa) + df_tot['3pa'] = pd.to_numeric(df_tot['3pa'], errors='coerce') + df_tot.insert(13, "ftm", ftm) + df_tot['ftm'] = pd.to_numeric(df_tot['ftm'], errors='coerce') + df_tot.insert(14, "fta", fta) + df_tot['fta'] = pd.to_numeric(df_tot['fta'], errors='coerce') + + return pd.concat([df, df_tot]) def _get_pbp_map(soup): From 531e0e69d59bba836cd9dfd0d137dfaa638c44a2 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Mon, 26 Sep 2022 16:44:57 -0500 Subject: [PATCH 3/7] parses and boxscore --- src/cbbpy/mens_scraper.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index b2d98e6..bd251d2 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -25,11 +25,12 @@ _log = logging.getLogger(__name__) ATTEMPTS = 10 -DATE_PARSES = ['%Y-%m-%d', - '%Y/%m/%d', - '%m-%d-%Y', - '%m/%d/%Y', - ] +DATE_PARSES = [ + '%Y-%m-%d', + '%Y/%m/%d', + '%m-%d-%Y', + '%m/%d/%Y', +] USER_AGENTS = [ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 ' + '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36', @@ -632,6 +633,9 @@ def get_game_ids(date: Union[str, datetime]) -> list: def _clean_boxscore_table(table, team, game_id): """A helper function to clean the DataFrame returned by get_game_boxscore""" + if len(table.find_all("thead")) == 0: + return pd.DataFrame([]) + # GET RID OF UNWANTED ROWS all_rows = table.find_all("tr") bad_rows_a = table.find_all("thead")[1].find_all("tr") From a947f22806aee3c5e6c2800461eb215ae7214710 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Mon, 26 Sep 2022 16:48:59 -0500 Subject: [PATCH 4/7] boxscore --- src/cbbpy/mens_scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index bd251d2..9c46b49 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -633,7 +633,7 @@ def get_game_ids(date: Union[str, datetime]) -> list: def _clean_boxscore_table(table, team, game_id): """A helper function to clean the DataFrame returned by get_game_boxscore""" - if len(table.find_all("thead")) == 0: + if len(table.find_all("thead")) <= 1: return pd.DataFrame([]) # GET RID OF UNWANTED ROWS From ce0d0311f7ca2ee2f61e270c1924bdf3ef247d9f Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Wed, 28 Sep 2022 00:15:20 -0500 Subject: [PATCH 5/7] #17 add is_postseason --- src/cbbpy/mens_scraper.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 9c46b49..88738d4 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -96,6 +96,25 @@ "TBD", "Suspended" ] +TOURN_WORDS = [ + 'tournament', + 'championship', + 'playoff', + '1st round', + '2nd round', + 'quarterfinal', + 'semifinal', + 'final' +] + +TOURN_SPEC = [ + 'cit ', + 'cbi ', + 'nit - ', + "men's basketball championship", + 'the basketball classic', + 'vegas 16', +] class CouldNotParseError(Exception): @@ -443,6 +462,11 @@ def get_game_info(game_id: str) -> pd.DataFrame: else: is_neutral = True + game_meta = str(game_meta) + + is_postseason = (any(x in game_meta.lower() for x in TOURN_WORDS) or + any(x in game_meta.lower() for x in TOURN_SPEC)) + # AGGREGATE DATA INTO DATAFRAME AND RETURN game_info_list = [ game_id, @@ -460,6 +484,7 @@ def get_game_info(game_id: str) -> pd.DataFrame: num_ots, is_conf, is_neutral, + is_postseason, game_meta, game_day, game_time, @@ -489,6 +514,7 @@ def get_game_info(game_id: str) -> pd.DataFrame: "num_ots", "is_conference", "is_neutral", + "is_postseason", "tournament", "game_day", "game_time", From 9ea5f9f161ccf63b1fc69ef7dc736fec33ec7788 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Wed, 28 Sep 2022 21:32:24 -0500 Subject: [PATCH 6/7] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 304a581..c4a6c87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" [project] name = "CBBpy" -version = "1.0.3" +version = "1.0.4" description = 'A Python-based web scraper for NCAA basketball.' readme = "README.md" authors = [{ name = "Daniel Cowan", email = "dnlcowan37@gmail.com" }] From bdae9fed40037fbc7b2ba055a3272c4fa792d6e4 Mon Sep 17 00:00:00 2001 From: dacowan2 <56355242+dcstats@users.noreply.github.com> Date: Thu, 29 Sep 2022 00:57:55 -0500 Subject: [PATCH 7/7] fix #19 --- src/cbbpy/mens_scraper.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index 88738d4..93e46f1 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -393,21 +393,26 @@ def get_game_info(game_id: str) -> pd.DataFrame: game_network = np.nan game_arena_pre = game_info_div.find( - "div", {"class": "caption-wrapper"}) + 'div', {'class': 'caption-wrapper'}) if not game_arena_pre: div_loc = game_info_div.find( - "div", {"class": "location-details"}) - game_arena = div_loc.find( - "span", {"class": "game-location"}).get_text().strip() - game_loc = div_loc.find( - "div", {"class": "game-location"}).get_text().strip() + 'div', {'class': 'location-details'}) + game_arena = div_loc.find('span', {'class': 'game-location'}) + + if game_arena: + game_arena = game_arena.get_text().strip() + game_loc = div_loc.find( + 'div', {'class': 'game-location'}).get_text().strip() + + else: + game_arena = game_info_div.find( + 'div', {'class': 'game-location'}).get_text().strip() + game_loc = None else: game_arena = game_arena_pre.get_text().strip() - game_loc = ( - game_info_div.find( - "div", {"class": "game-location"}).get_text().strip() - ) + game_loc = game_info_div.find( + 'div', {'class': 'game-location'}).get_text().strip() game_cap_pre = game_info_div.find_all( "div", {"class": "game-info-note capacity"})