import warnings
from altair.utils.deprecation import AltairDeprecationWarning
# Hide every FutureWarning and AltairDeprecationWarning from here on.
# NOTE(review): these are blanket, process-wide filters, so they also hide
# warnings raised by unrelated code — consider fixing the deprecated calls
# or scoping the filter with warnings.catch_warnings().
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=AltairDeprecationWarning)

# NOTE(review): `stack` is only assigned by the layout code near the end of
# this file, so this line raises NameError when the file is run top to bottom.
# It looks like a notebook display cell that was exported out of order — confirm.
stack.properties(title="Click on any legend to filter all visuals by player")

import pandas as pd
import altair as alt

# Read the per-game goalie log from disk and preview the first rows.
df = pd.read_csv("nhlgoalies.csv")
df.head(3)

# Turn the Date strings into real timestamps, then print the resulting schema.
df = df.assign(Date=pd.to_datetime(df["Date"]))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1544 entries, 0 to 1543
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Player  1544 non-null   object        
 1   Date    1544 non-null   datetime64[ns]
 2   Age     1544 non-null   object        
 3   Team    1544 non-null   object        
 4   Loc     773 non-null    object        
 5   Opp     1544 non-null   object        
 6   Result  1544 non-null   object        
 7   DEC     1470 non-null   object        
 8   MIN     1544 non-null   object        
 9   GA      1544 non-null   int64         
 10  SV      1544 non-null   int64         
 11  Shots   1544 non-null   int64         
 12  SV%     1543 non-null   float64       
 13  GAA     1544 non-null   float64       
dtypes: datetime64[ns](1), float64(2), int64(3), object(8)
memory usage: 169.0+ KB

# Recode Loc into labels: rows that carry a value become 'away',
# rows where it is missing become 'home'.
df["Loc"] = df["Loc"].isna().map({True: "home", False: "away"})
df.head(3)

# Keep only the leading field of the colon-separated MIN string, as an integer.
df["minutes"] = df["MIN"].str.split(":", n=1).str.get(0).astype(int)
# Drop the Result, DEC and Opp columns.
df = df.drop(columns=["Result", "DEC", "Opp"])

df.head(1)

# Rank goalies by mean SV% and keep the ten best. Only goalies with at least
# 20 games are ranked, so that a goalie who played one or two games cannot
# top the list with a 1.0 SV% average.

# Number of game rows per goalie.
games_played = df["Player"].value_counts()
# Goalies that meet the 20-game minimum.
eligible = games_played.loc[games_played.ge(20)].index
# Game rows that belong to eligible goalies only.
df_eligible = df.loc[df["Player"].isin(eligible)]

# Mean SV% per eligible goalie, reduced to the ten highest averages.
top10 = (
    df_eligible
    .groupby("Player")["SV%"]
    .mean()
    .reset_index()
    .nlargest(10, "SV%")
)
# All game rows of those ten goalies; every chart is drawn from this frame.
df_top10 = df.loc[df["Player"].isin(top10["Player"])]

print("Top 10 Goalies by AVG SV%: ")
(
    df_top10
    .groupby("Player")[["SV%", "GAA"]]
    .mean()
    .sort_values("SV%", ascending=False)
)

Top 10 Goalies by AVG SV%:

# Legend-bound point selection on Player. Clicking a legend entry selects that
# player; the same selection object is attached to every chart in this file so
# they all dim the unselected players together.
# `selection_point` replaces the deprecated `selection_multi` (Altair 5+).
select_legend = alt.selection_point(fields=["Player"], bind="legend")

# Scatter plot of SV% against GAA, one point per game row, coloured by player.
chart1 = alt.Chart(df_top10).mark_circle().encode(
    x=alt.X("GAA:Q"),
    y=alt.Y("SV%:Q", scale=alt.Scale(domain=[0.6, 1.0])),
    color=alt.Color("Player:N"),
    # Full opacity for selected players, dimmed otherwise.
    opacity=alt.condition(select_legend, alt.value(1), alt.value(0.2)),
    tooltip=["Player:N", "GAA:Q", "SV%:Q", "Team:N"],
).add_params(  # replaces the deprecated `add_selection`
    select_legend
).interactive()

# Horizontal bar chart of each player's mean SV%.
# NOTE(review): "Team:N" in the tooltip is an un-aggregated field, so it also
# acts as a grouping key for mean(SV%); a goalie with games for two teams would
# presumably get two bar segments — confirm against the data.
chart2 = alt.Chart(df_top10).mark_bar().encode(
    y='Player:N',
    # Narrow x domain so the small differences between averages are visible.
    x=alt.X('mean(SV%):Q', scale=alt.Scale(domain=[0.88, 0.94])),
    color="Player:N",
    # Full opacity for players picked in the legend, dimmed otherwise.
    opacity=alt.condition(select_legend, alt.value(1), alt.value(0.2)),
    tooltip=["Player:N", "mean(SV%):Q", "Team:N"],
).add_params(  # replaces the deprecated `add_selection`
    select_legend
).interactive()

# Time series of GAA, one line per player.

# Collapse to a single mean GAA per player, team and date, in case a date
# has more than one row.
df_trends = (
    df_top10
    .groupby(['Player', 'Team', 'Date'], as_index=False)
    .agg(GAA=('GAA', 'mean'))
)
chart3 = alt.Chart(df_trends).mark_line(point=True).encode(
    x='Date:T',
    y='GAA:Q',
    color='Player:N',
    # Full opacity for players picked in the legend, dimmed otherwise.
    opacity=alt.condition(select_legend, alt.value(1), alt.value(0.2)),
    tooltip=["Player:N", "GAA:Q", "Team:N", "Date:T"],
).properties(
    title="Seasonal GAA Trends by Goalie"
).add_params(  # replaces the deprecated `add_selection`
    select_legend
).interactive()

# Horizontal bar chart of each player's mean GAA.
# NOTE(review): "Team:N" in the tooltip is an un-aggregated field, so it also
# acts as a grouping key for mean(GAA) — confirm this is intended.
chart4 = alt.Chart(df_top10).mark_bar().encode(
    y='Player:N',
    x=alt.X('mean(GAA):Q'),
    color="Player:N",
    # Full opacity for players picked in the legend, dimmed otherwise.
    opacity=alt.condition(select_legend, alt.value(1), alt.value(0.2)),
    tooltip=["Player:N", "mean(GAA):Q", "Team:N"],
).add_params(  # replaces the deprecated `add_selection`
    select_legend
).interactive()

# Scatter plot of SV% against minutes played, one point per game row,
# coloured by player.
chart5 = alt.Chart(df_top10).mark_circle().encode(
    x=alt.X("minutes:Q"),
    y=alt.Y("SV%:Q", scale=alt.Scale(domain=[0.6, 1.0])),
    color=alt.Color("Player:N"),
    # Full opacity for players picked in the legend, dimmed otherwise.
    opacity=alt.condition(select_legend, alt.value(1), alt.value(0.2)),
    tooltip=["Player:N", "GAA:Q", "SV%:Q", "Team:N", "Date:T"],
).add_params(  # replaces the deprecated `add_selection`
    select_legend
).interactive()

chart5 = chart5.properties(title="Minutes Played vs SV%")

# Time series of SV%, one line per player.

# Collapse to a single mean SV% per player, team and date, in case a date
# has more than one row. The result column is named SVV.
df_trends2 = (
    df_top10
    .groupby(['Player', 'Team', 'Date'], as_index=False)
    .agg(SVV=('SV%', 'mean'))
)
chart6 = alt.Chart(df_trends2).mark_line(point=True).encode(
    x=alt.X('Date:T', title='Date'),
    y=alt.Y('SVV:Q', title='Save %', scale=alt.Scale(domain=[0.7, 1.025])),
    color='Player:N',
    # Full opacity for players picked in the legend, dimmed otherwise.
    opacity=alt.condition(select_legend, alt.value(1), alt.value(0.2)),
    tooltip=["Player:N", "SVV:Q", "Team:N", "Date:T"],
).properties(
    title="Seasonal SV% Trends by Goalie"
).add_params(  # replaces the deprecated `add_selection`
    select_legend
).interactive()
chart6

# Titles for the three charts that were built without one (charts 1, 2 and 4).
chart1 = chart1.properties(title="GAA vs SV% Scatter")
chart2 = chart2.properties(title="Avg Save %")
# GAA values in this data exceed 1 (e.g. 3.15), so it is not a percentage and
# the title carries no "%".
chart4 = chart4.properties(title="Avg GAA")

# Each row places two related charts side by side, with independent colour
# scales: the two scatter plots, the two time series, the two bar charts.
combined = alt.hconcat(chart1, chart5).resolve_scale(color='independent')
combined2 = alt.hconcat(chart3, chart6).resolve_scale(color='independent')
combined3 = alt.hconcat(chart2, chart4).resolve_scale(color='independent')

# Stack the three rows vertically to form the final dashboard.
stack = alt.vconcat(combined, combined2, combined3).resolve_scale(color='independent')

stack.properties(title="Click on any legend to filter all visuals by player")

	Player	Date	Age	Team	Loc	Opp	Result	DEC	MIN	GA	SV	Shots	SV%	GAA
0	Adin Hill	1/17/2025	28-251	VEG	@	CAR	L 2-3	L	57:08:00	3	25	28	0.893	3.15
1	Adin Hill	1/12/2025	28-246	VEG	NaN	MIN	W 4-1	W	59:46:00	1	15	16	0.938	1.00
2	Adin Hill	1/9/2025	28-243	VEG	NaN	NYI	L 0-4	L	58:30:00	3	17	20	0.850	3.08

	Player	Date	Age	Team	Loc	Opp	Result	DEC	MIN	GA	SV	Shots	SV%	GAA
0	Adin Hill	2025-01-17	28-251	VEG	away	CAR	L 2-3	L	57:08:00	3	25	28	0.893	3.15
1	Adin Hill	2025-01-12	28-246	VEG	home	MIN	W 4-1	W	59:46:00	1	15	16	0.938	1.00
2	Adin Hill	2025-01-09	28-243	VEG	home	NYI	L 0-4	L	58:30:00	3	17	20	0.850	3.08

	SV%	GAA
Player
Connor Hellebuyck	0.928944	1.965556
Logan Thompson	0.923538	2.118846
Darcy Kuemper	0.921087	2.102174
Dustin Wolf	0.916960	2.510000
Joey Daccord	0.916172	2.400690
Jacob Markström	0.912941	2.198235
Linus Ullmark	0.910870	2.312174
Filip Gustavsson	0.910613	2.781290
Andrei Vasilevskiy	0.907114	2.453429
Joseph Woll	0.906250	2.671667

Final Project for Fundamentals of Data Visualization¶

Project Outline:¶

Dataset Recap: NHL Goalies¶

Goals¶

Tasks¶

Note¶

Visualization Implementation¶

Visualization Summary¶

Justifications¶

Final Evaluation¶

Briefing:¶

Metrics Evaluated:¶

Results:¶

Synthesis & Next Steps¶

What Worked¶

What to Refine¶

Data Cleaning Prep for Visualization Implementation¶

(graders can ignore)¶