From 0a7fc79212cf67dcfb409987a708c92ab18d6eb0 Mon Sep 17 00:00:00 2001
From: dacowan2 <56355242+dcstats@users.noreply.github.com>
Date: Mon, 19 Sep 2022 23:35:34 -0500
Subject: [PATCH 1/7] added info about neutral/conf games
---
src/cbbpy/mens_scraper.py | 23 ++++++++++++++++++++++-
1 file changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py
index ff42d34..ed28466 100755
--- a/src/cbbpy/mens_scraper.py
+++ b/src/cbbpy/mens_scraper.py
@@ -425,6 +425,23 @@ def get_game_info(game_id: str) -> pd.DataFrame:
game_r2 = np.nan
game_r3 = np.nan
+ conf_home = 'conf' in home_div.get_text().lower()
+ conf_away = 'conf' in away_div.get_text().lower()
+ home_home = 'home' in home_div.get_text().lower()
+ away_away = 'away' in away_div.get_text().lower()
+
+ if conf_home and conf_away:
+ is_conf = True
+ else:
+ is_conf = False
+
+ if home_home or away_away:
+ is_neutral = False
+ elif is_conf and not type(game_meta) == str:
+ is_neutral = False
+ else:
+ is_neutral = True
+
# AGGREGATE DATA INTO DATAFRAME AND RETURN
game_info_list = [
game_id,
@@ -440,6 +457,8 @@ def get_game_info(game_id: str) -> pd.DataFrame:
away_score,
home_win,
num_ots,
+ is_conf,
+ is_neutral,
game_meta,
game_day,
game_time,
@@ -467,6 +486,8 @@ def get_game_info(game_id: str) -> pd.DataFrame:
"away_score",
"home_win",
"num_ots",
+ "is_conference",
+ "is_neutral",
"tournament",
"game_day",
"game_time",
@@ -532,7 +553,7 @@ def get_games_season(season: int) -> tuple:
all_data.append(games_info_day)
else:
- t.set_description(f"No games on {date}")
+ t.set_description(f"No games on {date.strftime('%D')}")
date += timedelta(days=1)
From 3862d71d7f8d0c8edc2bcebb67636de60187326b Mon Sep 17 00:00:00 2001
From: dacowan2 <56355242+dcstats@users.noreply.github.com>
Date: Tue, 20 Sep 2022 21:00:25 -0500
Subject: [PATCH 2/7] edge cases - boxscore tot
---
src/cbbpy/mens_scraper.py | 93 +++++++++++++++++++++++++++++++++++++--
1 file changed, 90 insertions(+), 3 deletions(-)
diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py
index ed28466..b2d98e6 100755
--- a/src/cbbpy/mens_scraper.py
+++ b/src/cbbpy/mens_scraper.py
@@ -188,7 +188,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame:
# no exception thrown
break
- return pd.concat([df_home, df_away])
+ return pd.concat([df_home, df_away]).reset_index(drop=True)
def get_game_pbp(game_id: str) -> pd.DataFrame:
@@ -250,7 +250,7 @@ def get_game_pbp(game_id: str) -> pd.DataFrame:
# no exception thrown
break
- return pd.concat(pbp_halves)
+ return pd.concat(pbp_halves).reset_index(drop=True)
def get_game_info(game_id: str) -> pd.DataFrame:
@@ -667,6 +667,32 @@ def _clean_boxscore_table(table, team, game_id):
df.pf = pd.to_numeric(df.pf, errors='coerce')
df.pts = pd.to_numeric(df.pts, errors='coerce')
+ # TOTALS ROW
+ tot_row = [row for row in all_rows if 'TEAM' in row.get_text()]
+ tot_t = "<table>"
+ tot_t += str(tot_row[0])
+ tot_t += "</table>"
+
+ df_tot = pd.read_html(tot_t)[0]
+
+ df_tot.columns = df.columns
+ # type handling
+ df_tot.starters = df_tot.starters.astype(str)
+ df_tot['min'] = pd.to_numeric(df_tot['min'], errors='coerce')
+ df_tot.fg = df_tot.fg.astype(str)
+ df_tot['3pt'] = df_tot['3pt'].astype(str)
+ df_tot.ft = df_tot.ft.astype(str)
+ df_tot.oreb = pd.to_numeric(df_tot.oreb, errors='coerce')
+ df_tot.dreb = pd.to_numeric(df_tot.dreb, errors='coerce')
+ df_tot.reb = pd.to_numeric(df_tot.reb, errors='coerce')
+ df_tot.ast = pd.to_numeric(df_tot.ast, errors='coerce')
+ df_tot.stl = pd.to_numeric(df_tot.stl, errors='coerce')
+ df_tot.blk = pd.to_numeric(df_tot.blk, errors='coerce')
+ df_tot.to = pd.to_numeric(df_tot.to, errors='coerce')
+ df_tot.pf = pd.to_numeric(df_tot.pf, errors='coerce')
+ df_tot.pts = pd.to_numeric(df_tot.pts, errors='coerce')
+
+ # START BY CLEANING PLAYER BOXSCORES
# GET PLAYER IDS
ids = [x.find("a")["href"].split("/")[-2]
for x in good_rows if x.find("a")]
@@ -718,7 +744,68 @@ def _clean_boxscore_table(table, team, game_id):
df.insert(14, "fta", fta)
df['fta'] = pd.to_numeric(df['fta'], errors='coerce')
- return df
+ # THEN CLEAN TOTAL ROW
+ # SPLIT UP THE FG FIELDS
+ fgm = [x.split('-')[0]
+ if x.split('-')[0] != ''
+ else np.nan
+ for x in df_tot['fg']]
+ fga = [x.split('-')[1]
+ if x.split('-')[1] != ''
+ else np.nan
+ for x in df_tot['fg']]
+ thpm = [x.split('-')[0]
+ if x.split('-')[0] != ''
+ else np.nan
+ for x in df_tot['3pt']]
+ thpa = [x.split('-')[1]
+ if x.split('-')[1] != ''
+ else np.nan
+ for x in df_tot['3pt']]
+ ftm = [x.split('-')[0]
+ if x.split('-')[0] != ''
+ else np.nan
+ for x in df_tot['ft']]
+ fta = [x.split('-')[1]
+ if x.split('-')[1] != ''
+ else np.nan
+ for x in df_tot['ft']]
+
+ # GET RID OF UNWANTED COLUMNS
+ df_tot = df_tot.drop(columns=["fg", "3pt", "ft"])
+
+ df_tot = df_tot.rename(columns={"starters": "player"})
+ df_tot['player'] = 'TEAM'
+
+ # INSERT COLUMNS WHERE NECESSARY
+ df_tot.insert(0, "game_id", game_id)
+ df_tot.game_id = df_tot.game_id.astype(str)
+ df_tot.insert(1, "team", team)
+ df_tot.team = df_tot.team.astype(str)
+ df_tot.insert(3, "player_id", 'TOTAL')
+ df_tot.player_id = df_tot.player_id.astype(str)
+ df_tot.insert(4, "position", 'TOTAL')
+ df_tot.position = df_tot.position.astype(str)
+ df_tot.insert(5, "starter", 0)
+ df_tot.starter = df_tot.starter.astype(bool)
+ df_tot.insert(7, "fgm", fgm)
+ df_tot.fgm = pd.to_numeric(df_tot.fgm, errors='coerce')
+ df_tot.insert(8, "fga", fga)
+ df_tot.fga = pd.to_numeric(df_tot.fga, errors='coerce')
+ df_tot.insert(9, "2pm", [float(x) - float(y) for x, y in zip(fgm, thpm)])
+ df_tot['2pm'] = pd.to_numeric(df_tot['2pm'], errors='coerce')
+ df_tot.insert(10, "2pa", [float(x) - float(y) for x, y in zip(fga, thpa)])
+ df_tot['2pa'] = pd.to_numeric(df_tot['2pa'], errors='coerce')
+ df_tot.insert(11, "3pm", thpm)
+ df_tot['3pm'] = pd.to_numeric(df_tot['3pm'], errors='coerce')
+ df_tot.insert(12, "3pa", thpa)
+ df_tot['3pa'] = pd.to_numeric(df_tot['3pa'], errors='coerce')
+ df_tot.insert(13, "ftm", ftm)
+ df_tot['ftm'] = pd.to_numeric(df_tot['ftm'], errors='coerce')
+ df_tot.insert(14, "fta", fta)
+ df_tot['fta'] = pd.to_numeric(df_tot['fta'], errors='coerce')
+
+ return pd.concat([df, df_tot])
def _get_pbp_map(soup):
From 531e0e69d59bba836cd9dfd0d137dfaa638c44a2 Mon Sep 17 00:00:00 2001
From: dacowan2 <56355242+dcstats@users.noreply.github.com>
Date: Mon, 26 Sep 2022 16:44:57 -0500
Subject: [PATCH 3/7] parses and boxscore
---
src/cbbpy/mens_scraper.py | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py
index b2d98e6..bd251d2 100755
--- a/src/cbbpy/mens_scraper.py
+++ b/src/cbbpy/mens_scraper.py
@@ -25,11 +25,12 @@
_log = logging.getLogger(__name__)
ATTEMPTS = 10
-DATE_PARSES = ['%Y-%m-%d',
- '%Y/%m/%d',
- '%m-%d-%Y',
- '%m/%d/%Y',
- ]
+DATE_PARSES = [
+ '%Y-%m-%d',
+ '%Y/%m/%d',
+ '%m-%d-%Y',
+ '%m/%d/%Y',
+]
USER_AGENTS = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 ' +
'(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
@@ -632,6 +633,9 @@ def get_game_ids(date: Union[str, datetime]) -> list:
def _clean_boxscore_table(table, team, game_id):
"""A helper function to clean the DataFrame returned by get_game_boxscore"""
+ if len(table.find_all("thead")) == 0:
+ return pd.DataFrame([])
+
# GET RID OF UNWANTED ROWS
all_rows = table.find_all("tr")
bad_rows_a = table.find_all("thead")[1].find_all("tr")
From a947f22806aee3c5e6c2800461eb215ae7214710 Mon Sep 17 00:00:00 2001
From: dacowan2 <56355242+dcstats@users.noreply.github.com>
Date: Mon, 26 Sep 2022 16:48:59 -0500
Subject: [PATCH 4/7] boxscore
---
src/cbbpy/mens_scraper.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py
index bd251d2..9c46b49 100755
--- a/src/cbbpy/mens_scraper.py
+++ b/src/cbbpy/mens_scraper.py
@@ -633,7 +633,7 @@ def get_game_ids(date: Union[str, datetime]) -> list:
def _clean_boxscore_table(table, team, game_id):
"""A helper function to clean the DataFrame returned by get_game_boxscore"""
- if len(table.find_all("thead")) == 0:
+ if len(table.find_all("thead")) <= 1:
return pd.DataFrame([])
# GET RID OF UNWANTED ROWS
From ce0d0311f7ca2ee2f61e270c1924bdf3ef247d9f Mon Sep 17 00:00:00 2001
From: dacowan2 <56355242+dcstats@users.noreply.github.com>
Date: Wed, 28 Sep 2022 00:15:20 -0500
Subject: [PATCH 5/7] #17 add is_postseason
---
src/cbbpy/mens_scraper.py | 26 ++++++++++++++++++++++++++
1 file changed, 26 insertions(+)
diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py
index 9c46b49..88738d4 100755
--- a/src/cbbpy/mens_scraper.py
+++ b/src/cbbpy/mens_scraper.py
@@ -96,6 +96,25 @@
"TBD",
"Suspended"
]
+TOURN_WORDS = [
+ 'tournament',
+ 'championship',
+ 'playoff',
+ '1st round',
+ '2nd round',
+ 'quarterfinal',
+ 'semifinal',
+ 'final'
+]
+
+TOURN_SPEC = [
+ 'cit ',
+ 'cbi ',
+ 'nit - ',
+ "men's basketball championship",
+ 'the basketball classic',
+ 'vegas 16',
+]
class CouldNotParseError(Exception):
@@ -443,6 +462,11 @@ def get_game_info(game_id: str) -> pd.DataFrame:
else:
is_neutral = True
+ game_meta = str(game_meta)
+
+ is_postseason = (any(x in game_meta.lower() for x in TOURN_WORDS) or
+ any(x in game_meta.lower() for x in TOURN_SPEC))
+
# AGGREGATE DATA INTO DATAFRAME AND RETURN
game_info_list = [
game_id,
@@ -460,6 +484,7 @@ def get_game_info(game_id: str) -> pd.DataFrame:
num_ots,
is_conf,
is_neutral,
+ is_postseason,
game_meta,
game_day,
game_time,
@@ -489,6 +514,7 @@ def get_game_info(game_id: str) -> pd.DataFrame:
"num_ots",
"is_conference",
"is_neutral",
+ "is_postseason",
"tournament",
"game_day",
"game_time",
From 9ea5f9f161ccf63b1fc69ef7dc736fec33ec7788 Mon Sep 17 00:00:00 2001
From: dacowan2 <56355242+dcstats@users.noreply.github.com>
Date: Wed, 28 Sep 2022 21:32:24 -0500
Subject: [PATCH 6/7] Update pyproject.toml
---
pyproject.toml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pyproject.toml b/pyproject.toml
index 304a581..c4a6c87 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "CBBpy"
-version = "1.0.3"
+version = "1.0.4"
description = 'A Python-based web scraper for NCAA basketball.'
readme = "README.md"
authors = [{ name = "Daniel Cowan", email = "dnlcowan37@gmail.com" }]
From bdae9fed40037fbc7b2ba055a3272c4fa792d6e4 Mon Sep 17 00:00:00 2001
From: dacowan2 <56355242+dcstats@users.noreply.github.com>
Date: Thu, 29 Sep 2022 00:57:55 -0500
Subject: [PATCH 7/7] fix #19
---
src/cbbpy/mens_scraper.py | 25 +++++++++++++++----------
1 file changed, 15 insertions(+), 10 deletions(-)
diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py
index 88738d4..93e46f1 100755
--- a/src/cbbpy/mens_scraper.py
+++ b/src/cbbpy/mens_scraper.py
@@ -393,21 +393,26 @@ def get_game_info(game_id: str) -> pd.DataFrame:
game_network = np.nan
game_arena_pre = game_info_div.find(
- "div", {"class": "caption-wrapper"})
+ 'div', {'class': 'caption-wrapper'})
if not game_arena_pre:
div_loc = game_info_div.find(
- "div", {"class": "location-details"})
- game_arena = div_loc.find(
- "span", {"class": "game-location"}).get_text().strip()
- game_loc = div_loc.find(
- "div", {"class": "game-location"}).get_text().strip()
+ 'div', {'class': 'location-details'})
+ game_arena = div_loc.find('span', {'class': 'game-location'})
+
+ if game_arena:
+ game_arena = game_arena.get_text().strip()
+ game_loc = div_loc.find(
+ 'div', {'class': 'game-location'}).get_text().strip()
+
+ else:
+ game_arena = game_info_div.find(
+ 'div', {'class': 'game-location'}).get_text().strip()
+ game_loc = None
else:
game_arena = game_arena_pre.get_text().strip()
- game_loc = (
- game_info_div.find(
- "div", {"class": "game-location"}).get_text().strip()
- )
+ game_loc = game_info_div.find(
+ 'div', {'class': 'game-location'}).get_text().strip()
game_cap_pre = game_info_div.find_all(
"div", {"class": "game-info-note capacity"})