Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions docs/active_roster.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Active Roster

`active_roster(team)`

Get the current 40-man roster for a given team. Returns the contents of the 40-man roster table on the
team's Baseball Reference page (for example, https://www.baseball-reference.com/teams/WSN/2025.shtml#all_the40man),
plus two added columns: `player_ID`, the player's bref ID, and `Alt URL`, an alternate URL for minor leaguers.

## Arguments
`team:` String. Must be a three-letter abbreviation that bref uses for an active MLB team.

## Examples of valid queries

```python
from pybaseball import active_roster

# get the Nationals' current 40-man roster
data = active_roster('WSN')

```
21 changes: 21 additions & 0 deletions docs/appearances_bref.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Appearances Bref

`appearances_bref(season)`

Get defensive appearances for a given season.

## Arguments
`season:` Integer. Defaults to the most recent season if no value is provided. Must be 1871 or later.

## Examples of valid queries

```python
from pybaseball import appearances_bref

# get the current season's up-to-date appearances
data = appearances_bref()

# get the end-of-season appearances for the 1960 season
data = appearances_bref(1960)

```
2 changes: 2 additions & 0 deletions pybaseball/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
statcast_fielding_run_value
)
from .league_batting_stats import batting_stats_bref
from .appearances_bref import appearances_bref
from .active_roster import active_roster
from .league_batting_stats import batting_stats_range
from .league_batting_stats import bwar_bat
from .league_pitching_stats import pitching_stats_bref
Expand Down
81 changes: 81 additions & 0 deletions pybaseball/active_roster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from typing import Optional

import pandas as pd
from bs4 import BeautifulSoup, Comment

from . import cache
from .utils import most_recent_season, get_bref_id_from_player_link, ACTIVE_TEAMS
from .datasources.bref import BRefSession

# Module-level Baseball Reference session; get_soup() sends its page request through it.
session = BRefSession()

def get_soup(team: str) -> BeautifulSoup:
    """
    Download a team's Baseball Reference page for the most recent season and parse it.

    ARGUMENTS
        team (str): the team abbreviation inserted into the page URL
    """
    url = f'https://www.baseball-reference.com/teams/{team}/{most_recent_season()}.shtml'
    response = session.get(url)
    return BeautifulSoup(response.content, "lxml")

def get_tables(soup: BeautifulSoup) -> pd.DataFrame:
    """
    Build a DataFrame from the 40-man roster table of a parsed team page.

    The table is read from an HTML comment inside the ``all_the40man`` element,
    so that comment is located and re-parsed before the table itself is read.

    ARGUMENTS
        soup (BeautifulSoup): the parsed team page

    RETURNS
        A DataFrame with the table's columns (minus the leading Rk column) plus
        'player_ID' and 'Alt URL'. For each row exactly one of those two is
        filled in; the other is an empty string.

    RAISES
        ValueError: if the 40-man roster table cannot be found in the page
    """
    # find commented 40-man roster table and parse that
    all_the40man = soup.find(id='all_the40man')
    comment = None
    if all_the40man is not None:
        # `string=` is the current name of the deprecated `text=` argument
        comment = all_the40man.find(string=lambda text: isinstance(text, Comment))
    table = None
    if comment is not None:
        table = BeautifulSoup(comment, 'lxml').find(id='the40man')
    if table is None:
        raise ValueError("Could not find the 40-man roster table on the page.")

    # the first header cell is Rk, which is unnecessary
    headings = [th.get_text() for th in table.find("tr").find_all("th")][1:]

    # add ID and alternate URL column names
    headings += ['player_ID', 'Alt URL']

    # pull in data rows
    data = []
    for row in table.find('tbody').find_all('tr'):
        player_link = row.find('a')
        if not player_link:
            # rows without any link carry no player and are skipped
            continue
        cols = [ele.text.strip() for ele in row.find_all('td')]

        href = player_link.get('href')

        # determine whether the player has reached the majors and has a bref ID
        if href.startswith('/players/'):
            # player has played in majors and has an id
            cols += [get_bref_id_from_player_link(href), '']
        else:
            # player has not reached the majors, give them an alternate url
            cols += ['', href]

        data.append(cols)

    # use headings for column names
    return pd.DataFrame(data, columns=headings)


@cache.df_cache()
def active_roster(team: str) -> pd.DataFrame:
    """
    Fetch the 40-man roster of a given MLB team as a pandas DataFrame

    ARGUMENTS
        team (str): the three-letter bref abbreviation for an active MLB team
    """
    # only abbreviations listed in ACTIVE_TEAMS are accepted
    team_is_active = team in ACTIVE_TEAMS
    if not team_is_active:
        raise ValueError(
            "Team must be the three-letter abbreviation of an active MLB team."
        )

    # download the team page from baseball reference, then parse its roster table
    return get_tables(get_soup(team))
70 changes: 70 additions & 0 deletions pybaseball/appearances_bref.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from typing import Optional

import pandas as pd
from bs4 import BeautifulSoup

from . import cache
from .utils import most_recent_season, get_bref_id_from_player_link
from .datasources.bref import BRefSession

# Module-level Baseball Reference session; get_soup() sends its page request through it.
session = BRefSession()

def get_soup(year: int) -> BeautifulSoup:
    """
    Download the Baseball Reference appearances page for a season and parse it.

    ARGUMENTS
        year (int): the season year inserted into the page URL
    """
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-appearances-fielding.shtml'
    response = session.get(url)
    return BeautifulSoup(response.content, "lxml")

def get_tables(soup: BeautifulSoup, season: int) -> pd.DataFrame:
    """
    Convert the appearances table of a parsed page into a DataFrame.

    ARGUMENTS
        soup (BeautifulSoup): the parsed appearances page
        season (int): the year of the season (not used by the parsing itself)
    """
    appearances = soup.find(id='appearances')

    # column names come from the first table row; the leading Rk header is
    # dropped and a trailing column is added for the bref ID
    column_names = [cell.get_text() for cell in appearances.find("tr").find_all("th")]
    column_names.pop(0)
    column_names.append('player_ID')

    records = []
    for tr in appearances.find('tbody').find_all('tr'):
        anchor = tr.find('a')
        if anchor:
            # keep only rows that link to a player, and tag each with the bref ID from that link
            record = [td.text.strip() for td in tr.find_all('td')]
            record.append(get_bref_id_from_player_link(anchor.get('href')))
            records.append(record)

    return pd.DataFrame(records, columns=column_names)


@cache.df_cache()
def appearances_bref(season: Optional[int] = None) -> pd.DataFrame:
    """
    Returns a pandas DataFrame of the defensive appearances for a given MLB season, or
    appearances for the current / most recent season if the season is not specified.

    ARGUMENTS
        season (int): the year of the season; must be 1871 or later

    RAISES
        ValueError: if season is earlier than 1871
    """
    # fall back to the most recent season if none was specified
    if season is None:
        season = most_recent_season()
    if season < 1871:
        raise ValueError(
            "This query currently only returns appearances from the 1871 season onward. "
            "Try looking at years from 1871 to present."
        )

    # retrieve html from baseball reference and parse the appearances table
    return get_tables(get_soup(season), season)
40 changes: 40 additions & 0 deletions pybaseball/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from collections import namedtuple
from datetime import date, datetime, timedelta
import functools
Expand All @@ -7,6 +8,7 @@

import pandas as pd
import requests
from bs4 import Tag

from . import cache

Expand Down Expand Up @@ -84,6 +86,39 @@
{'WAS', 'WST'}
]

# Three-letter team abbreviations treated as active MLB teams, in alphabetical order.
ACTIVE_TEAMS = [
    'ARI', 'ATH', 'ATL', 'BAL', 'BOS', 'CHC',
    'CHW', 'CIN', 'CLE', 'COL', 'DET', 'HOU',
    'KCR', 'LAA', 'LAD', 'MIA', 'MIL', 'MIN',
    'NYM', 'NYY', 'PHI', 'PIT', 'SDP', 'SEA',
    'SFG', 'STL', 'TBR', 'TEX', 'TOR', 'WSN',
]

def get_first_season(team: str, include_equivalents: bool = True) -> Optional[int]:
if not include_equivalents:
return first_season_map[team]
Expand Down Expand Up @@ -385,3 +420,8 @@ def norm_positions(pos: Union[int, str], to_word: bool = False, to_number: bool
# lower() ok due to positional numbers being cast as strings when created
return normed.lower()

def get_bref_id_from_player_link(player_link: str) -> str:
    """
    Pull the bref ID out of a player page link.

    ARGUMENTS
        player_link (str): href of a player page, e.g. '/players/s/slapncy01.shtml'

    RETURNS
        the ID portion of the link, e.g. 'slapncy01'

    RAISES
        ValueError: if the link does not look like a player page link
    """
    # Some IDs contain characters other than lowercase letters and digits
    # (e.g. the apostrophe in d'arntr01), so capture everything between the
    # one-letter directory and the .shtml extension.
    match = re.search(r"players/[a-z]/([^/]+?)\.shtml", player_link)
    if match is None:
        raise ValueError(f"Could not find a bref ID in player link: {player_link!r}")
    return match.group(1)

3,369 changes: 3,369 additions & 0 deletions tests/pybaseball/data/active_roster.html

Large diffs are not rendered by default.

24 changes: 24 additions & 0 deletions tests/pybaseball/test_active_roster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from typing import Callable

import pytest

from pybaseball import active_roster

@pytest.fixture(name="sample_html")
def _sample_html(get_data_file_contents: Callable[[str], str]) -> str:
    """Load the contents of the active_roster.html test data file."""
    html = get_data_file_contents('active_roster.html')
    return html

def test_active_roster(response_get_monkeypatch: Callable, sample_html: str):
    """Check team validation and parsing of the sample 40-man roster page."""
    response_get_monkeypatch(sample_html)

    # an unknown team abbreviation must be rejected
    with pytest.raises(ValueError) as ex_info:
        active_roster('FAKE')
    # Compare the message itself (not the truthiness of a stringified bool). The check sits
    # after the `with` block because nothing following the raising call inside it would run.
    assert str(ex_info.value) == 'Team must be the three-letter abbreviation of an active MLB team.'

    active_roster_result = active_roster('WSN')

    # make sure IL is populated
    assert active_roster_result[active_roster_result["Name"] == "Cade Cavalli"]["IL"].values[0] == "15-day"

    # make sure a player who hasn't played in the majors has Alt URL set correctly
    assert active_roster_result[active_roster_result["Name"] == "Andry Lara"]["Alt URL"].values[0]
22 changes: 22 additions & 0 deletions tests/pybaseball/test_appearances_bref.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import unittest
from pybaseball.appearances_bref import appearances_bref

class TestAppearancesBref(unittest.TestCase):
    """Tests for appearances_bref: season validation and values in the returned table."""

    def test_wrong_season_error(self):
        # seasons before 1871 must be rejected
        with self.assertRaises(ValueError):
            appearances_bref(1870)

    def test_year_with_no_awards(self):
        # a year where the awards column is empty / excluded should still load without error
        result = appearances_bref(1871)

        # spot-check one player's value
        eggler = result[result["Player"] == "Dave Eggler"]
        assert eggler["CF"].values[0] == "33"

    def test_year_with_awards(self):
        result = appearances_bref(1913)

        # the awards column should be populated for this player
        johnson = result[result["Player"] == "Walter Johnson"]
        assert johnson["Awards"].values[0] == "MVP-1"
9 changes: 8 additions & 1 deletion tests/pybaseball/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from datetime import date, datetime, timedelta

import pytest
from bs4 import Tag

from pybaseball.utils import DATE_FORMAT, sanitize_date_range
from pybaseball.utils import DATE_FORMAT, sanitize_date_range, get_bref_id_from_player_link


def test_sanitize_date_range_nones() -> None:
Expand Down Expand Up @@ -52,3 +53,9 @@ def test_sanitize_date_range_start_dt_gt_end_dt() -> None:
assert start_dt_date < end_dt_date
assert str(start_dt_date) == end_dt
assert str(end_dt_date) == start_dt


def test_get_bref_id_from_player_link() -> None:
    # the function takes the link's href string (not a bs4 Tag), so pass the href directly
    test_link = '/players/s/slapncy01.shtml'

    assert get_bref_id_from_player_link(test_link) == 'slapncy01'