The following analysis was the basis for The Washington Post story: How baseball’s war on sticky stuff is already changing the game

knitr::opts_chunk$set(warning = F)
knitr::opts_chunk$set(message = F)

# install packages if missing
packages <- c("tidyverse", "readxl", "janitor",
              "gghighlight", "lubridate", "DT", "knitr", "ggtext")
if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
  install.packages(setdiff(packages, rownames(installed.packages())), repos = "http://cran.us.r-project.org")  
}

# load libraries
library(tidyverse)
library(readxl)
library(janitor)
library(lubridate)
library(knitr)
library(ggrepel)
library(DT)
library(ggtext)

# Load data and specify pitch types, results, and outcomes
pitches <- read_csv("../../data/clean_data/combined_2017_2021.csv") %>%
  mutate(pitch_type=case_when(
    pi_pitch_type=="CH" ~ "changeup",
    pi_pitch_type=="SI" ~ "sinker",
    pi_pitch_type=="SL" ~ "slide",
    pi_pitch_type=="CU" ~ "knuckle curve",
    pi_pitch_type=="FA" ~ "fastball",
    pi_pitch_type=="FC" ~ "cutter",
    pi_pitch_type=="KN" ~ "knuckleball",
    pi_pitch_type=="SB" ~ "screwball",
    pi_pitch_type=="FS" ~ "sinking fastball/splitter",
    pi_pitch_type=="CS" ~ "curve, slow",
  ),
  pitch_results=case_when(
    pitch_result=="B" ~ "ball",
    pitch_result=="F" ~ "foul",
    pitch_result=="FB" ~ "fly ball/popup",
    pitch_result=="GB" ~ "ground ball",
    pitch_result=="LD" ~ "line drive",
    pitch_result=="PU" ~ "fly ball/popup",
    pitch_result=="S" ~ "strike"
  ),
  pitch_outcomes=case_when(
    pitch_outcome=="1B" ~ "first base",
    pitch_outcome=="2B" ~ "second base",
    pitch_outcome=="3B" ~ "third base",
    pitch_outcome=="B" ~ "ball",
    pitch_outcome=="F" ~ "foul",
    pitch_outcome=="HR" ~ "homerun",
    pitch_outcome=="OUT" ~ "out",
    pitch_outcome=="S" ~ "strike",
    pitch_outcome=="HBP" ~ "hit by pitch",
    pitch_outcome=="NIP" ~ "not in play",
    pitch_outcome=="RBOE" ~ "reached base on error"
  )) %>% 
  mutate(bauer_units=rpm/mph)

# Bring in salary data (artifact of older analysis, not necessary in this one)
salaries <- read_csv("../../data/clean_data/salary_lut.csv") %>% 
  mutate(salary=gsub("\\$", "", salary),
         salary=as.numeric(gsub(",", "", salary))) %>% 
  mutate(rank=rank(desc(salary)))

# figuring out the highest 30 paid pitchers
salaries30 <- salaries %>% 
  arrange(desc(salary)) 

salaries30 <- head(salaries, 30)

# isolating the names of the top 30-paid pitchers
sel_names <- as.character(salaries30$mlbid)

Toplines

All mentions of spin rate are adjusted for velocity.

  • The average spin rates for fastballs from pitchers have been climbing year over year since 2017 but in the three weeks after June 3 of 2021, the average to levels lower than they’ve ever been before spin data began being tracked

*Nearly 70% of pitchers who threw more than 10 fastballs after June 3 saw a decrease in their average adjusted spin rates compared to the earlier 2021 season

  • The data is on spin is new– less than five years of data has been collected. Experts are still trying to determine the effects of really high spin rates, but the data shows that spin rates have of many pitchers increased year over year since 2017

  • For many, the spin rates peaked in 2020 and 2021 but has since declined after June 3 (nearly half the players)

    • Like Trevor Bauer, Gerrit Cole, Brad Boxberger, Lucas Sims, Tyler Mahle, and Adam Ottavino (and Junior Guerra and Burch Smith)
  • Some teams in particular have also been increasing their spin rates every season since 2017

    • More than half of teams have been growing spin rates and saw a drop after June 3
    • A handful of them, such as the Cincinnati Reds and Brewers have seen a dramatic drop after June 3
  • Some teams saw increase in the spin rates of the pitchers who joined them compared to their prior team the year before.

    • The average spin rate increase is .35 when pitchers get traded to a new team but there are 8 teams that are more than 3 -5 times that
    • The Astros, Brewers, Cardinals, and Padres each had notable increases in recent years
  • It’s still too early to say what the effects of lower spin rates have had on games

    • The average number of hard hits from fast ball pitches have increased on average per game
#Average adjusted spin rate by time period

ba <- pitches %>% 
  filter(pitch_type=="fastball") %>% 
  mutate(when=case_when(
    year(game_date)==2017 ~ "2017",
    year(game_date)==2018 ~ "2018",
    year(game_date)==2019 ~ "2019",
    year(game_date)==2020 ~ "2020",
    year(game_date)==2021 & game_date < ymd("2021-06-03") ~ "2021",
    game_date >= ymd("2021-06-03") ~ "after June 3",
    TRUE ~ "missing"
  )) %>% 
  group_by(when) %>% 
  summarize(avg_rpm=round(mean(rpm, na.rm=T),1),
            avg_bu=round(mean(bauer_units, na.rm=T),2)) 

1. Spin rates by season

The average spin rates for fastballs from pitchers have been climbing year over year since 2017 but in the three weeks after June 3 of 2021, the average to levels lower than they’ve ever been before spin data began being tracked.

Chart

# charting the time periods

ba %>% ggplot(aes(x=when, y=avg_bu)) +
  geom_col() +
  #ylim(20, 25) +
  coord_cartesian(ylim = c(20, 25)) +
  theme_minimal() +
  labs(x="", y="Average RPM", title="Average adjusted fastball spin rate by season")

#write_csv(ba, "../outputs/graphics_data/annual_average_trend.csv", na="")
#ggsave("outputs/graphics_data/annual_average_trend.svg", width=10, height=5)

Table

#table tab

kable(ba)
when avg_rpm avg_bu
2017 2256.5 24.16
2018 2259.7 24.18
2019 2283.4 24.44
2020 2301.6 24.62
2021 2314.7 24.69
after June 3 2259.0 24.09

2. Sanity check

The drop is still stark even when limiting the universe to before June 3 and June 3 and June 24 (which is how far we have in the data).

# adding even more time frames

ba_restricted <- pitches %>% 
  filter(pitch_type=="fastball") %>% 
  mutate(when=case_when(
    year(game_date)==2017 & game_date < ymd("2017-06-03") ~ "2017 before",
    year(game_date)==2017 & game_date > ymd("2017-06-03") & game_date <= ymd("2021-06-17")~ "2017 after",
    
    year(game_date)==2018 & game_date < ymd("2018-06-03") ~ "2018 before",
    year(game_date)==2018 & game_date > ymd("2018-06-03") & game_date <= ymd("2021-06-17")~ "2018 after",
    
    year(game_date)==2019 & game_date < ymd("2019-06-03") ~ "2019 before",
    year(game_date)==2019 & game_date > ymd("2019-06-03") & game_date <= ymd("2019-06-17")~ "2019 after",
    
    year(game_date)==2020 & game_date < ymd("2020-06-03") ~ "2020 before",
    year(game_date)==2020 & game_date > ymd("2020-06-03") & game_date <= ymd("2020-06-17")~ "2020 after",
    
    year(game_date)==2021 & game_date < ymd("2021-06-03") ~ "2021 before",
    year(game_date)==2021 & game_date > ymd("2021-06-03") & game_date <= ymd("2021-06-17")~ "2021 after",
    
    TRUE ~ "missing"
  )) %>% 
  group_by(when) %>% 
  summarize(avg_rpm=round(mean(rpm, na.rm=T),1),
            avg_bu=round(mean(bauer_units, na.rm=T),1)) %>% 
  filter(when!="missing") %>% 
  mutate(year=gsub(" .*", "", when),
         type=gsub(".* ", "", when)) %>% 
  mutate(year=as.character(year)) %>% 
  mutate(type_order=case_when(
    type=="before" ~ 1,
    type=="after" ~ 2
  ))

# slopegraph
ba_restricted %>% 
  ggplot(aes(x=fct_reorder(type, type_order), y=avg_bu, group=year, color=year)) +
  geom_line() +
  # NEW CODE BELOW
  #gghighlight(year) +
  theme_minimal() +
  labs(y="Average Bauer Units",
       x="Before and after June 3 of each year",
       title="Average fastball adjusted spin rates for each season through June 16")

3. Pitchers who improved but then dropped

# looking only at fastball pitches adjusted spin rates before and after in 2021
# and then calculating the percent change between those two time periods

ba <- pitches %>% 
  filter(pitch_type=="fastball") %>% 
  filter(year==2021) %>% 
  mutate(when=case_when(
    game_date >= ymd("2021-06-03") ~ "after",
    TRUE ~ "before"
  )) %>% 
  group_by(pitcher_team, pitcher_mlbid, pitcher_name, when) %>% 
  summarize(avg_bu=mean(bauer_units, na.rm=T)) %>% 
  pivot_wider(names_from="when", values_from="avg_bu") %>% 
  mutate(percent_change=round((after-before)/before*100,1))

# looking only at fastball pitches adjusted spin rates before and after in 2021
# and counting up the number of fastballs thrown in each period

ba2 <- pitches %>% 
  filter(pitch_type=="fastball") %>% 
  filter(year==2021) %>% 
  mutate(when=case_when(
    game_date >= ymd("2021-06-03") ~ "after",
    TRUE ~ "before"
  )) %>% 
  mutate(when_count=case_when(
    game_date >= ymd("2021-06-03") ~ 1,
    TRUE ~ 0
  )) %>% 
  group_by(pitcher_team, pitcher_mlbid, pitcher_name) %>% 
  summarize(fastballs_total=n(),
            fastballs_after=sum(when_count)) 

# joining the two dataframes together
ba3 <- left_join(ba, ba2)# %>% 
 # mutate(pitcher_mlbid=as.character(pitcher_mlbid))

# bringing in salary data for no reason anymore
ba3 <- ba3 %>% 
   left_join(salaries, by=c("pitcher_mlbid"="mlbid"))

# looking only at pitchers who've pitched more than 10 times after June 3
# and ranking their percent change before and after June 3
ba3 <- ba3 %>% 
  rename(salary_rank=rank) %>% 
  filter(fastballs_after>=10) %>% 
  ungroup() %>% 
  mutate(bu_rank=rank(percent_change))

# calculating average adjusted fastball spin rates for each pitcher during each time period
year_over_year <- pitches %>% 
  filter(pitch_type=="fastball") %>% 
  mutate(when=case_when(
    year(game_date)==2017 ~ "2017",
    year(game_date)==2018 ~ "2018",
    year(game_date)==2019 ~ "2019",
    year(game_date)==2020 ~ "2020",
    year(game_date)==2021 & game_date < ymd("2021-06-03") ~ "2021",
    game_date >= ymd("2021-06-03") ~ "after June 3",
    TRUE ~ "missing"
  )) %>% 
  group_by(when, pitcher_name, pitcher_mlbid) %>% 
  summarize(avg_rpm=round(mean(rpm, na.rm=T),1),
            avg_bu=round(mean(bauer_units, na.rm=T),2)) 

# calculating average adjusted fastball spin rates for pitchers for each time period 
# but this time filtering for a pitcher's first year as long as it isn't 2020
year_over_year1 <- pitches %>% 
  filter(pitch_type=="fastball") %>% 
  mutate(when=case_when(
    year(game_date)==2017 ~ "2017",
    year(game_date)==2018 ~ "2018",
    year(game_date)==2019 ~ "2019",
    year(game_date)==2020 ~ "2020",
    year(game_date)==2021 & game_date < ymd("2021-06-03") ~ "2021",
    game_date >= ymd("2021-06-03") ~ "after June 3",
    TRUE ~ "missing"
  )) %>% 
  group_by(when, pitcher_name, pitcher_mlbid) %>% 
  summarize(avg_rpm=round(mean(rpm, na.rm=T),1),
            avg_bu=round(mean(bauer_units, na.rm=T),2)) %>% 
  filter(when!="missing" & when!="after June 3") %>% 
  group_by(pitcher_name, pitcher_mlbid) %>% 
  arrange(when) %>% 
  slice(1) %>% 
  mutate(year_off="earliest year") %>% 
  filter(when!="2020")

# calculating average adjusted fastball spin rates for pitchers for each time period 
# but this time filtering 2020
year_over_year2 <- pitches %>% 
  filter(pitch_type=="fastball") %>% 
  mutate(when=case_when(
    year(game_date)==2017 ~ "2017",
    year(game_date)==2018 ~ "2018",
    year(game_date)==2019 ~ "2019",
    year(game_date)==2020 ~ "2020",
    year(game_date)==2021 & game_date < ymd("2021-06-03") ~ "2021",
    game_date >= ymd("2021-06-03") ~ "after June 3",
    TRUE ~ "missing"
  )) %>% 
  group_by(when, pitcher_name, pitcher_mlbid) %>% 
  summarize(avg_rpm=round(mean(rpm, na.rm=T),1),
            avg_bu=round(mean(bauer_units, na.rm=T),2)) %>% 
  filter(when=="2020")  %>% 
  mutate(year_off="last year")

# joining these two annual dataframes
# and then spreading out the data so it's easier to calculate the percent change
# from earliest year and 2020
year_over_year3 <- rbind(year_over_year1, year_over_year2) 
year_over_year3 <- year_over_year3 %>%
  ungroup() %>% 
  select(-when, -avg_rpm) %>% 
  pivot_wider(names_from="year_off", values_from="avg_bu")

year_over_year3 <- year_over_year3 %>% 
  mutate(annual_growth=round((`last year`-`earliest year`)/`earliest year`*100,2))

# filtering this for more than 10 pitches after June 3
ba4 <- full_join(ba3 %>% filter(fastballs_after>=10), year_over_year3)

# adding some labels to indicate if adjusted spin rates increased or decreased
ba_aggregate <- ba3 %>% 
  filter(fastballs_after>=10) %>% 
  mutate(more_less=case_when(
    percent_change >0 ~ "Increased",
    percent_change <0 ~ "Decreased",
    TRUE ~ "Other"
  )) %>% 
  count(more_less) %>% 
  mutate(percent=round(n/sum(n)*100,2))

# looking at those who've decreased
decreased <- filter(ba_aggregate, more_less=="Decreased") %>% 
  pull(percent)

# not using anywhere but just wanted to see
# only the pitchers with data in the two fields we care about
# percent change from their earliest year to 2020
# and percent change from 2021 to post June 3 2021
ba4_aggregate <- ba4 %>%
  filter(fastballs_after>=10) %>% 
  filter(!is.na(percent_change) & !is.na(annual_growth))  %>% 
  mutate(changed=case_when(
    annual_growth >0 & percent_change <0 ~ 1,
    TRUE ~ 0
  )) %>% 
  summarize(changed_total=sum(changed),
            total=n())

Nearly (70%) of pitchers who threw more than 10 fastballs after June 3 saw a decrease in their average adjusted spin rates compared to the earlier 2021 season.

The drop is interesting when compared to how pitchers have increased their spin rates through the years: 131 out of 262 – nearly half.

Chart

# fancy scatte rplot time

ba4_chart <- ba4 %>% 
  filter(!is.na(annual_growth) & !is.na(percent_change)) %>% 
  ggplot(aes(x=annual_growth, y=percent_change)) +
  geom_rect(aes(xmin=0, xmax=Inf, ymin=-Inf,
                ymax=0), color="transparent", fill="orange", alpha=0.8) +
  geom_point() +
  geom_hline(yintercept=0) +
  geom_vline(xintercept=0) +
  theme_minimal() +
  annotate("text", x = 12, y = 4, label = "Spin rate increased\nyear over year and\nincreased after June 3") +
    annotate("text", x = -3, y = 4, label = "Spin rate decreased\nyear over year but\nincreased after June 3") +
   annotate("text", x = -3, y = -8, label = "Spin rate decreased\nyear over year and\nalso fell after June 3") +
  annotate("text", x = 15, y = -8, label = "Spin rate increased\nyear over year but\nfell after June 3") +
  geom_text_repel(data = . %>% 
                  mutate(label = ifelse(percent_change < -4 & annual_growth >9,
                                          pitcher_name, "")),
                  aes(label = label), 
                  box.padding = 1,
                  show.legend = FALSE) +
  labs(title="Pitchers' spin rate then versus spin rate now",
       subtitle="Average spin rate change over career compared to average spin rate change after June 3.\nOnly pitchers who've thrown at least 10 fastballs since June 3 included.",
       x="Spin rate change over the years",
       y="Spin rate change after June 3")

ba4_chart

#ggsave(ba4_chart, "../graphics_data/scatterplot.svg", width=6, height=6)

Table

# table of the data used in the scatterplot
ba4 %>%  
  mutate(before=round(before, 1),
         after=round(after, 1)) %>% 
  select(team=pitcher_team, pitcher=pitcher_name,
         before, after,
         percent_change, 
         `earliest year`,
         `last year`,
         annual_growth) %>% 
  datatable(extensions = c("Buttons"),     
                   options = list(dom = 'Bfrtip',
                             buttons = list(list(extend = "csv", 
                                                 text = "Download Table",
                                                 filename = "spin_rate_then_spin_rate_now.csv",
                             exportOptions = list(modifier = list(page = "all"))))))

5. Teams analysis

Some teams in particular have also been increasing their spin rates every season since 2017.

A handful of them, such as the Cincinatti Reds and Brewers have seen a dramatic drop after June 3.

# same thing abovewith pitchers but focused on teams
# and joining with official team name instead of abbreviations

team_names_df <- read_csv("../../data/clean_data/team_names.csv")

ba <- pitches %>% 
  filter(pitch_type=="fastball") %>% 
  filter(year==2021) %>% 
  mutate(when=case_when(
    game_date >= ymd("2021-06-03") ~ "after",
    TRUE ~ "before"
  )) %>% 
  group_by(pitcher_team, when) %>% 
  summarize(avg_bu=mean(bauer_units, na.rm=T)) %>% 
  pivot_wider(names_from="when", values_from="avg_bu") %>% 
  mutate(percent_change=round((after-before)/before*100,1))


ba2 <- pitches %>% 
  filter(pitch_type=="fastball") %>% 
  filter(year==2021) %>% 
  mutate(when=case_when(
    game_date >= ymd("2021-06-03") ~ "after",
    TRUE ~ "before"
  )) %>% 
  mutate(when_count=case_when(
    game_date >= ymd("2021-06-03") ~ 1,
    TRUE ~ 0
  )) %>% 
  group_by(pitcher_team) %>% 
  summarize(fastballs_total=n(),
            fastballs_after=sum(when_count)) 

ba3 <- left_join(ba, ba2)# %>% 

ba3 <- ba3 %>% 
  filter(fastballs_after>=10) %>% 
  ungroup() %>% 
  mutate(bu_rank=rank(percent_change))

year_over_year <- pitches %>% 
  filter(pitch_type=="fastball") %>% 
  mutate(when=case_when(
    year(game_date)==2017 ~ "2017",
    year(game_date)==2018 ~ "2018",
    year(game_date)==2019 ~ "2019",
    year(game_date)==2020 ~ "2020",
    year(game_date)==2021 & game_date < ymd("2021-06-03") ~ "2021",
    game_date >= ymd("2021-06-03") ~ "after June 3",
    TRUE ~ "missing"
  )) %>% 
  group_by(when, pitcher_team) %>% 
  summarize(avg_rpm=round(mean(rpm, na.rm=T),1),
            avg_bu=round(mean(bauer_units, na.rm=T),2)) 

year_over_year1 <- pitches %>% 
  filter(pitch_type=="fastball") %>% 
  mutate(when=case_when(
    year(game_date)==2017 ~ "2017",
    year(game_date)==2018 ~ "2018",
    year(game_date)==2019 ~ "2019",
    year(game_date)==2020 ~ "2020",
    year(game_date)==2021 & game_date < ymd("2021-06-03") ~ "2021",
    game_date >= ymd("2021-06-03") ~ "after June 3",
    TRUE ~ "missing"
  )) %>% 
  group_by(when, pitcher_team) %>% 
  summarize(avg_rpm=round(mean(rpm, na.rm=T),1),
            avg_bu=round(mean(bauer_units, na.rm=T),2)) %>% 
  filter(when!="missing" & when!="after June 3") %>% 
  group_by(pitcher_team) %>% 
  arrange(when) %>% 
  slice(1) %>% 
  mutate(year_off="earliest year") %>% 
  filter(when!="2020")


year_over_year2 <- pitches %>% 
  filter(pitch_type=="fastball") %>% 
  mutate(when=case_when(
    year(game_date)==2017 ~ "2017",
    year(game_date)==2018 ~ "2018",
    year(game_date)==2019 ~ "2019",
    year(game_date)==2020 ~ "2020",
    year(game_date)==2021 & game_date < ymd("2021-06-03") ~ "2021",
    game_date >= ymd("2021-06-03") ~ "after June 3",
    TRUE ~ "missing"
  )) %>% 
  group_by(when, pitcher_team) %>% 
  summarize(avg_rpm=round(mean(rpm, na.rm=T),1),
            avg_bu=round(mean(bauer_units, na.rm=T),2)) %>% 
  filter(when=="2020")  %>% 
  mutate(year_off="last year")

year_over_year3 <- rbind(year_over_year1, year_over_year2) 
year_over_year3 <- year_over_year3 %>%
  ungroup() %>% 
  select(-when, -avg_rpm) %>% 
  pivot_wider(names_from="year_off", values_from="avg_bu")

year_over_year3 <- year_over_year3 %>% 
  mutate(annual_growth=round((`last year`-`earliest year`)/`earliest year`*100,2))

ba4 <- full_join(ba3 %>% filter(fastballs_after>=5), year_over_year3)

ba4 <- left_join(ba4, team_names_df)

Chart

ba4 %>% 
  filter(!is.na(annual_growth) & !is.na(percent_change)) %>% 
  ggplot(aes(x=annual_growth, y=percent_change)) +
  geom_rect(aes(xmin=0, xmax=Inf, ymin=-Inf,
                ymax=0), color="transparent", fill="orange", alpha=0.8) +
  geom_point() +
  geom_hline(yintercept=0) +
  geom_vline(xintercept=0) +
  theme_minimal() +
#  annotate("text", x = 12, y = 4, label = "Spin rate increased\nyear over year and\nincreased after June 3") +
#    annotate("text", x = -5, y = 4, label = "Spin rate decreased\nyear over year but\nincreased after June 3") +
#   annotate("text", x = -5, y = -8, label = "Spin rate decreased\nyear over year and\nalso fell after June 3") +
#  annotate("text", x = 15, y = -8, label = "Spin rate increased\nyear over year but\nfell after June 3") +
  geom_text_repel(data = . %>% 
                  mutate(label = ifelse(percent_change < -1 & annual_growth >3,
                                          pitcher_team_name, "")),
                  aes(label = label), 
                  box.padding = 1,
                  show.legend = FALSE) +
  labs(title="Team pitchers' spin rate then versus spin rate now",
       subtitle="Average spin rate change over career compared to average spin rate change after June 3.\nOnly pitchers who've thrown at least 5 fastballs since June 3 included.",
       x="Spin rate change over the years",
       y="Spin rate change after June 3")

Table

ba4 %>% 
  mutate(before=round(before, 1),
         after=round(after, 1)) %>% 
  select(team=pitcher_team_name,
         before, after,
         percent_change, 
         `earliest year`,
         `last year`,
         annual_growth) %>% 
  datatable(extensions = c("Buttons"),     
                   options = list(dom = 'Bfrtip',
                             buttons = list(list(extend = "csv", 
                                                 text = "Download Table",
                                                 filename = "team_spin_rate_then_spin_rate_now.csv",
                             exportOptions = list(modifier = list(page = "all"))))))

7. Team influence

Some teams see a notable increase in the pitchers who join their team compared to their prior ones. The Astros, Brewers, Cardinals, and Padres each saw their new pitchers grow.

Chart

# looking only at fastballs

# fancy wrangling here to determine of a player has switched teams between seasons
pitcher_teams <- pitches %>% 
  left_join(team_names_df) %>% 
  filter(pitch_type=="fastball") %>% 
  group_by(pitcher_team_name, pitcher_name, year) %>% 
  summarize(average_bu=mean(bauer_units, na.rm=T),
            fastball_pitches=n()) %>% 
  filter(fastball_pitches>=20) %>% 
  arrange(pitcher_name, year) %>% 
  group_by(pitcher_name) %>% 
  mutate(lag_average_bu=lag(average_bu, 1),
         lag_team=lag(pitcher_team_name)) %>% 
  mutate(growth=round((average_bu-lag_average_bu)/lag_average_bu*100,2)) %>% 
  mutate(new_team=case_when(
    lag_team!=pitcher_team_name & !is.na(lag_team) ~ T,
    TRUE ~ F
  )) %>% 
  filter(new_team==T) %>% 
  group_by(pitcher_team_name, year) %>% 
  summarize(average_bu_growth=mean(growth, na.rm=T),
            pitchers=n())

pitcher_teams %>% 
  ggplot(aes(x=year, y=average_bu_growth)) +
  geom_col() +
  facet_wrap(~pitcher_team_name, ncol=4) +
  labs(title="Average spin rate growth for new pitchers on the team",
       subtitle="For pitchers who threw more than 20 fastballs") +
  theme_minimal() 

Table

 pitches %>% 
  left_join(team_names_df) %>% 
  filter(pitch_type=="fastball") %>% 
  group_by(pitcher_team_name, pitcher_name, year) %>% 
  summarize(average_bu=mean(bauer_units, na.rm=T),
            fastball_pitches=n()) %>% 
  filter(fastball_pitches>=20) %>% 
  arrange(pitcher_name, year) %>% 
  group_by(pitcher_name) %>% 
  mutate(lag_average_bu=lag(average_bu, 1),
         lag_team=lag(pitcher_team_name)) %>% 
  mutate(growth=round((average_bu-lag_average_bu)/lag_average_bu*100,2)) %>% 
  mutate(new_team=case_when(
    lag_team!=pitcher_team_name & !is.na(lag_team) ~ T,
    TRUE ~ F
  )) %>% 
  filter(new_team==T) %>% 
  group_by(pitcher_team_name) %>% 
  summarize(average_bu_growth=round(mean(growth, na.rm=T),2),
            pitchers=n()) %>% 
  rename(team=pitcher_team_name, `average spin growth`=average_bu_growth) %>% 
  arrange(desc(`average spin growth`)) %>% 

  datatable(extensions = c("Buttons"),     
                   options = list(dom = 'Bfrtip',
                             buttons = list(list(extend = "csv", 
                                                 text = "Download Table",
                                                 filename = "team_growth_new_pitchers.csv",
                             exportOptions = list(modifier = list(page = "all"))))))

8. Effects of spin on hits

# not using this
pitch_percent_daily <- pitches %>% 
  filter(year==2021) %>% 
  filter(strikes_before==2) %>% 
  #filter(pitch_type=="fastball") %>% 
  unique() %>% 
  group_by(game_date, pitcher_mlbid, at_bat_index) %>% 
  #top_n(-1) %>% 
  ungroup() %>% 
  count(year, game_date, pitch_outcomes) %>% 
  group_by(game_date) %>% 
  mutate(percent=n/sum(n, na.rm=T)*100) %>%
  filter(pitch_outcomes %in% c("ball", "strike", "out", "first base"))
  #filter(!pitch_outcomes %in% c("knuckleball", "screwball", "curve, slow", "cutter"))

#write_csv(pitch_percent_daily, "../graphics_data/pitch_percent_daily.csv", na="")

pitch_percent_daily %>% 
  ggplot(aes(x=game_date, y=percent, group=pitch_outcomes, color=pitch_outcomes, fill=pitch_outcomes)) +
  geom_col() +
  facet_wrap(~pitch_outcomes, scales="free_y", ncol=2) +
  theme_minimal() +
    geom_smooth(method = "loess", formula = 'y ~ x', se = FALSE, color="grey") + 
  geom_vline(xintercept=ymd("2021-06-03")) +
  theme(legend.position = "none") +
  labs(title="Outcomes of fastball pitches after 2 strikes by game in 2021")

The results of the crackdown are also being seen at the plate. Fewer at-bats after two strikes are ending in walks or strikes after June 3, a review of recent data shows, and more than 44 percent of them are ending with contact — up from 40 percent earlier in the season.

# from time periods
# but looking at contact
# so filtering only to swings that dont lead to strikes

pitch_percent_daily <- pitches %>% 
  #filter(year==2021) %>% 
  filter(strikes_before==2) %>% 
  mutate(when=case_when(
    year(game_date)==2017 ~ "2017",
    year(game_date)==2018 ~ "2018",
    year(game_date)==2019 ~ "2019",
    year(game_date)==2020 ~ "2020",
    year(game_date)==2021 & game_date < ymd("2021-06-03") ~ "2021",
    game_date >= ymd("2021-06-03") ~ "after June 3",
    TRUE ~ "missing"
  )) %>% 
  #group_by(when) %>% 
  #filter(pitch_type=="cutter") %>% 
  #filter(pitch_type=="fastball") %>% 
  #(pitcher_name_last=="Bauer") %>% 
  unique() %>% 
  mutate(contact=case_when(
    swing_type=="swing" & pitch_results!="strike" ~ T,
    TRUE ~ F
  )) %>% 
  group_by(game_date, pitcher_mlbid, at_bat_index) %>% 
  #top_n(-1) %>% 
  ungroup() %>% 
  #count(year, game_date, contact) %>% 
  count(when, contact) %>% 
  group_by(when) %>% 
  mutate(percent=n/sum(n, na.rm=T)*100) %>% 
  filter(contact==T)


pitch_percent_daily %>% 
  ggplot(aes(x=when, y=percent, group=contact, color=contact, fill=contact)) +
  geom_col() +
  #facet_wrap(~contact, ncol=2) +
  theme_minimal() +
  theme(legend.position = "none") +
  coord_cartesian(ylim = c(20, 48)) +
  theme_minimal() +
  #labs(title="Contact during last pitches at bat by game ")
  labs(title="Contact rate on two-strike pitches")

#ignore this chunk

pitch_percent_daily <- pitches %>% 
  filter(year==2021) %>% 
    filter(strikes_before==2) %>% 

  unique() %>% 
  mutate(contact=case_when(
    swing_type=="swing" & pitch_results!="strike" ~ T,
    TRUE ~ F
  )) %>% 
  group_by(game_date, pitcher_mlbid, at_bat_index) %>% 
  #top_n(-1) %>% 
  ungroup() %>% 
  #count(year, game_date, contact) %>% 
  count(game_date, contact) %>% 
  group_by(game_date) %>% 
  mutate(percent=n/sum(n, na.rm=T)*100) %>% 
  filter(contact==T)


pitch_percent_daily %>% 
  ggplot(aes(x=game_date, y=percent, group=contact, color=contact, fill=contact)) +
  geom_col() +
  #facet_wrap(~contact, ncol=2) +
  theme_minimal() +
  theme(legend.position = "none") +
    geom_vline(xintercept=ymd("2021-06-03")) +

  #labs(title="Contact during last pitches at bat by game ")
  labs(title="Contact rate on two-strike pitches in 2021")

#write_csv(pitch_percent_daily, "outputs/graphics_data/contact_game_number.csv", na="")
# same as above but looking at contact daily
pitch_percent_daily_dates <- pitches %>% 
  select(year, game_date) %>% 
  arrange(year, game_date) %>% 
  unique() %>% 
  group_by(year) %>% 
  mutate(game_id=row_number()) 


pitch_percent_daily <- pitches %>% 
  filter(strikes_before==2) %>% 
  unique() %>% 
  mutate(contact=case_when(
    swing_type=="swing" & pitch_results!="strike" ~ T,
    TRUE ~ F
  )) %>% 
  group_by(game_date, pitcher_mlbid, at_bat_index) %>% 
  #top_n(-1) %>% 
  ungroup() %>% 
  left_join(pitch_percent_daily_dates) %>% 
  #count(year, game_date, contact) %>% 
  count(year, game_id, contact) %>% 
  group_by(year, game_id) %>% 
  mutate(percent=n/sum(n, na.rm=T)*100) %>% 
  filter(contact==T) %>% 
  mutate(year=as.character(year))

#pitch_percent_daily %>% 
#  ggplot(aes(x=game_id, y=percent, group=year, color=year, fill=year)) +
#  geom_line() +
#  theme_minimal() +
#  geom_vline(xintercept=81) +
#  labs(title="Contact rate on two-strike pitches")


#pitch_percent_daily %>% 
  ggplot() +
  geom_point(data=pitch_percent_daily %>% filter(year %in% c("2017", "2018", "2019")), aes(x=game_id, y=percent, group=year), color="gray", alpha=.8, size=1) +
  geom_line(data=pitch_percent_daily %>% filter(year %in% c("2020", "2021")), aes(x=game_id, y=percent, group=year, color=year)) +
  geom_point(data=pitch_percent_daily %>% filter(year %in% c("2020", "2021")), aes(x=game_id, y=percent, group=year, color=year), size=1) +
  #facet_wrap(~contact, ncol=2) +
  theme_minimal() +
  theme(legend.position = "none") +
  geom_vline(xintercept=64, color="#5768c3") +
  #labs(title="Contact during last pitches at bat by game ")
   labs(
    x = "Game number",
    y = "Percent contact",
    title = "<span style = 'font-size:14pt; font-family:Helvetica;'>Contact rate on two-strike pitches</span><br>
The percent of batters making contact on pitches after two strikes were in decline in <span style = 'color:#F8766D;'>2020</span> and <span style = 'color:#01BFC4;'>2021</span><br /> compared to <span style = 'color:#BEBEBE;'>previous years</span> but climbed back up after <span style= 'color:#5768c3;'>June 3</span>"
  ) +
  #theme_bw() +
  theme(
    text = element_text(family = "Helvetica"),
    plot.title.position = "plot",
    plot.title = element_markdown(size = 11, lineheight = 1.2)
  )

#ignore everything below

#ggsave("outputs/graphics_data/contact_rates.pdf", width=6, height=4)
#ggsave("outputs/graphics_data/contact_rates.pdf", width=6, height=4)

  
#pitch_percent_daily %>% 
  ggplot() +
  geom_line(data=pitch_percent_daily, aes(x=game_id, y=percent, group=year, color=year)) +
  #facet_wrap(~contact, ncol=2) +
  theme_minimal() +
  #theme(legend.position = "none") +
  geom_vline(xintercept=64)



pitch_percent_daily <- pitches %>% 
  filter(year==2021) %>% 
  filter(strikes_before==2) %>% 

  #filter(pitch_type=="cutter") %>% 
  unique() %>% 
  group_by(game_date, pitcher_mlbid, at_bat_index) %>% 
  #top_n(-1) %>% 
  ungroup() %>% 
  count(year, game_date, pitch_outcomes) %>% 
  group_by(game_date) %>% 
  mutate(percent=n/sum(n, na.rm=T)*100) #%>%
  #filter(pitch_outcomes %in% c("ball", "strike", "out", "first base"))
  #filter(!pitch_outcomes %in% c("knuckleball", "screwball", "curve, slow", "cutter"))

#write_csv(pitch_percent_daily, "../graphics_data/pitch_percent_daily.csv", na="")

pitch_percent_daily %>% 
  ggplot(aes(x=game_date, y=percent, group=pitch_outcomes, color=pitch_outcomes, fill=pitch_outcomes)) +
  geom_col() +
  facet_wrap(~pitch_outcomes, ncol=2) +
  theme_minimal() +
    geom_smooth(method = "loess", formula = 'y ~ x', se = FALSE, color="grey") + 
  geom_vline(xintercept=ymd("2021-06-03")) +
  theme(legend.position = "none") +
  labs(title="Outcomes of last pitches at bat by game in 2021")




pitch_percent_daily <- pitches %>% 
  filter(year==2021) %>% 
  #filter(strikes_before==2) %>% 

  #filter(pitch_type=="cutter") %>% 
  unique() %>% 
  group_by(game_date, pitcher_mlbid, at_bat_index) %>% 
  top_n(-1) %>% 
  ungroup() %>% 
  count(year, game_date, pitch_results) %>% 
  group_by(game_date) %>% 
  mutate(percent=n/sum(n, na.rm=T)*100) #%>%
  #filter(pitch_outcomes %in% c("ball", "strike", "out", "first base"))
  #filter(!pitch_outcomes %in% c("knuckleball", "screwball", "curve, slow", "cutter"))

#write_csv(pitch_percent_daily, "../graphics_data/pitch_percent_daily.csv", na="")

pitch_percent_daily %>% 
  ggplot(aes(x=game_date, y=percent, group=pitch_results, color=pitch_results, fill=pitch_results)) +
  geom_col() +
  facet_wrap(~pitch_results, ncol=2) +
  theme_minimal() +
    geom_smooth(method = "loess", formula = 'y ~ x', se = FALSE, color="grey") + 
  geom_vline(xintercept=ymd("2021-06-03")) +
  theme(legend.position = "none") +
  labs(title="Results of last pitches at bat by game in 2021")


#write_csv(pitch_percent_daily, "outputs/graphics_data/pitch_outcomes_2021.csv", na="")

#ggsave("outputs/graphics_data/pitch_outcomes_2021.svg", width=6, height=4)

#ggsave("outputs/graphics_data/pitch_outcomes_2021.pdf", width=6, height=4)
#game_pk
#at_bat_index
#pitch_of_ab
#strikes_before
#pitch_outcomes


pitches %>% 
  filter(strikes_before==2) %>% 

  #filter(pitch_type=="fastball") %>% 
  mutate(when=case_when(
    year(game_date)==2017 ~ "2017",
    year(game_date)==2018 ~ "2018",
    year(game_date)==2019 ~ "2019",
    year(game_date)==2020 ~ "2020",
    year(game_date)==2021 & game_date < ymd("2021-06-03") ~ "2021",
    game_date >= ymd("2021-06-03") ~ "after June 3",
    TRUE ~ "missing"
  )) %>% 
  count(when, pitch_type) %>%
  group_by(when) %>% 
  mutate(percent=round(n/sum(n, na.rm=T)*100,2)) %>% 
  ggplot(aes(x=when, y=percent)) +
  geom_col() +
  facet_wrap(~pitch_type)+
  labs(title="Pitch type percent after the second strike")



pitches %>% 
  filter(strikes_before==2) %>% 

  #filter(pitch_type=="fastball") %>% 
  mutate(when=case_when(
    year(game_date)==2017 ~ "2017",
    year(game_date)==2018 ~ "2018",
    year(game_date)==2019 ~ "2019",
    year(game_date)==2020 ~ "2020",
    year(game_date)==2021 & game_date < ymd("2021-06-03") ~ "2021",
    game_date >= ymd("2021-06-03") ~ "after June 3",
    TRUE ~ "missing"
  )) %>% 
  count(game_date, pitcher_name, when, pitch_type, pitch_results) %>% 
  filter(!is.na(pitch_results)) %>% 
  group_by(when, pitch_type, pitch_results) %>% 
  summarize(average_results=mean(n, na.rm=T)) %>% 
  ggplot(aes(x=when, y=average_results)) +
  geom_col() +
  facet_grid(pitch_type~pitch_results)+
  labs(title="Pitch results by type")


pitches %>% 
  filter(strikes_before==2) %>% 

  #filter(pitch_type=="fastball") %>% 
  mutate(when=case_when(
    year(game_date)==2017 ~ "2017",
    year(game_date)==2018 ~ "2018",
    year(game_date)==2019 ~ "2019",
    year(game_date)==2020 ~ "2020",
    year(game_date)==2021 & game_date < ymd("2021-06-03") ~ "2021",
    game_date >= ymd("2021-06-03") ~ "after June 3",
    TRUE ~ "missing"
  )) %>% 
  count(game_date, pitcher_name, when, pitch_type, pitch_results) %>% 
  filter(!is.na(pitch_results)) %>% 
  group_by(when, pitch_type, pitch_results) %>% 
  summarize(total=n()) %>%
  group_by(when, pitch_type) %>% 
  mutate(percent=round(total/sum(total, na.rm=T)*100,2)) %>% 
  ggplot(aes(x=when, y=percent)) +
  geom_col() +
  facet_grid(pitch_type~pitch_results)+
  labs(title="Pitch results by type")

pitches %>% 
  filter(pitch_type=="fastball") %>% 
  mutate(when=case_when(
    year(game_date)==2017 ~ "2017",
    year(game_date)==2018 ~ "2018",
    year(game_date)==2019 ~ "2019",
    year(game_date)==2020 ~ "2020",
    year(game_date)==2021 & game_date < ymd("2021-06-03") ~ "2021",
    game_date >= ymd("2021-06-03") ~ "after June 3",
    TRUE ~ "missing"
  )) %>% 
  count(game_date, pitcher_name, when, pitch_outcomes) %>% 
  filter(!is.na(pitch_outcomes)) %>% 
  group_by(when, pitch_outcomes) %>% 
  summarize(average_outcomes=mean(n, na.rm=T)) %>% 
  ggplot(aes(x=when, y=average_outcomes)) +
  geom_col() +
  facet_wrap(~pitch_outcomes, ncol=3) +
  labs(title="Average number of fastball pitch outcomes per game")
pitches %>% 
  filter(pitch_type=="fastball") %>% 
  filter(hardness=="hard") %>% 
  mutate(when=case_when(
    year(game_date)==2017 ~ "2017",
    year(game_date)==2018 ~ "2018",
    year(game_date)==2019 ~ "2019",
    year(game_date)==2020 ~ "2020",
    year(game_date)==2021 & game_date < ymd("2021-06-03") ~ "2021",
    game_date >= ymd("2021-06-03") ~ "after June 3",
    TRUE ~ "missing"
  )) %>% 
  count(game_date, pitcher_name, when, hardness) %>% 
  filter(!is.na(hardness)) %>% 
  group_by(when, hardness) %>% 
  summarize(average_outcomes=mean(n, na.rm=T)) %>% 
  ggplot(aes(x=when, y=average_outcomes)) +
  geom_col() +
  facet_wrap(~hardness) +
  coord_cartesian(ylim = c(1, 1.5)) +

  labs(title="Average number of hardness pitch outcomes per game")

ba <- pitches %>% 
  filter(pitch_type=="fastball") %>% 
  mutate(when=case_when(
    year(game_date)==2017 ~ "2017",
    year(game_date)==2018 ~ "2018",
    year(game_date)==2019 ~ "2019",
    year(game_date)==2020 ~ "2020",
    year(game_date)==2021 & game_date < ymd("2021-06-03") ~ "2021",
    game_date >= ymd("2021-06-03") ~ "after June 3",
    TRUE ~ "missing"
  )) %>% 
  group_by(when) %>% 
  summarize(avg_rpm=round(mean(rpm, na.rm=T),1),
            avg_bu=round(mean(bauer_units, na.rm=T),2)) 

ba_restricted <- pitches %>% 
  filter(pitch_type=="fastball") %>% 
  mutate(when=case_when(
    year(game_date)==2017 & game_date < ymd("2017-06-03") ~ "2017 before",
    year(game_date)==2017 & game_date > ymd("2017-06-03") & game_date <= ymd("2021-06-17")~ "2017 after",
    
    year(game_date)==2018 & game_date < ymd("2018-06-03") ~ "2018 before",
    year(game_date)==2018 & game_date > ymd("2018-06-03") & game_date <= ymd("2021-06-17")~ "2018 after",
    
    year(game_date)==2019 & game_date < ymd("2019-06-03") ~ "2019 before",
    year(game_date)==2019 & game_date > ymd("2019-06-03") & game_date <= ymd("2019-06-17")~ "2019 after",
    
    year(game_date)==2020 & game_date < ymd("2020-06-03") ~ "2020 before",
    year(game_date)==2020 & game_date > ymd("2020-06-03") & game_date <= ymd("2020-06-17")~ "2020 after",
    
    year(game_date)==2021 & game_date < ymd("2021-06-03") ~ "2021 before",
    year(game_date)==2021 & game_date > ymd("2021-06-03") & game_date <= ymd("2021-06-17")~ "2021 after",
    
    TRUE ~ "missing"
  )) %>% 
  group_by(when) %>% 
  summarize(avg_rpm=round(mean(rpm, na.rm=T),1),
            avg_bu=round(mean(bauer_units, na.rm=T),1)) %>% 
  filter(when!="missing") %>% 
  mutate(year=gsub(" .*", "", when),
         type=gsub(".* ", "", when)) %>% 
  mutate(year=as.character(year)) %>% 
  mutate(type_order=case_when(
    type=="before" ~ 1,
    type=="after" ~ 2
  ))

#pitch_results
#pitch_outcomes
#hardness