Skip to content

ScraperNHL Documentation

Data Export

maxtixador/scrapernhl

Data Export Examples¶

Learn how to save scraped data to various file formats.

Setup¶

import pandas as pd
import os
from datetime import datetime
from scrapernhl import HockeyScraper

# Ensure the export directory exists before any snippet writes into it.
# exist_ok=True means an already-present directory is not an error.
os.makedirs(name='output', exist_ok=True)

Export to CSV¶

nhl = HockeyScraper('nhl')

# Team list -> CSV. index=False keeps the row index out of the file.
teams_csv = 'output/nhl_teams.csv'
teams = nhl.scrape_teams()
teams.to_csv(teams_csv, index=False)
print(f"Saved {len(teams)} teams to {teams_csv}")

# Montreal's 2025-26 schedule -> CSV, written the same way.
schedule_csv = 'output/mtl_schedule.csv'
schedule = nhl.schedule(team='MTL', season=20252026)
schedule.to_csv(schedule_csv, index=False)
print(f"Saved {len(schedule)} games to {schedule_csv}")

Export to Excel (Multiple Sheets)¶

nhl = HockeyScraper('nhl')

# Fetch the three tables; each one becomes its own worksheet below.
standings = nhl.standings()
skaters = nhl.team_stats(team='MTL', season=20252026, goalies=False)
goalies = nhl.team_stats(team='MTL', season=20252026, goalies=True)

# Sheet name -> table. Dict insertion order fixes the worksheet order.
sheets = {
    'Standings': standings,
    'Skaters': skaters,
    'Goalies': goalies,
}

# One writer for the whole workbook so all sheets land in the same file.
with pd.ExcelWriter('output/nhl_data.xlsx', engine='openpyxl') as writer:
    for sheet_name, table in sheets.items():
        table.to_excel(writer, sheet_name=sheet_name, index=False)

print("Saved Excel file with 3 sheets to output/nhl_data.xlsx")

Export to JSON¶

nhl = HockeyScraper('nhl')

json_path = 'output/game_pbp.json'
jsonl_path = 'output/game_pbp.jsonl'

pbp = nhl.play_by_play(2024020001)

# Indented array of records: easy to read, larger on disk.
pbp.to_json(json_path, orient='records', indent=2)
print(f"Saved {len(pbp)} events to {json_path}")

# JSON Lines: one record per line, no enclosing array.
pbp.to_json(jsonl_path, orient='records', lines=True)

# Report both file sizes in KB (bytes / 1024) for comparison.
json_size, jsonl_size = (
    os.path.getsize(path) / 1024 for path in (json_path, jsonl_path)
)
print(f"File sizes: JSON={json_size:.1f}KB, JSONL={jsonl_size:.1f}KB")

Export to Parquet (Recommended for Large Datasets)¶

nhl = HockeyScraper('nhl')

parquet_path = 'output/games_pbp.parquet'
csv_path = 'output/games_pbp.csv'

game_ids = [2024020001, 2024020002, 2024020003]
combined = nhl.scrape_multiple_games(game_ids)

# Write the combined events as snappy-compressed Parquet.
combined.to_parquet(parquet_path, index=False, compression='snappy')
print(f"Saved {len(combined)} events to {parquet_path}")

# Read the file straight back to confirm it loads.
loaded = pd.read_parquet(parquet_path)
print(f"Verified: Loaded {len(loaded)} events")

# Write the same data as CSV so the two on-disk sizes can be compared.
combined.to_csv(csv_path, index=False)
parquet_size = os.path.getsize(parquet_path) / 1024
csv_size = os.path.getsize(csv_path) / 1024
saved_pct = 100 * (1 - parquet_size / csv_size)
print(
    f"Parquet={parquet_size:.1f}KB, CSV={csv_size:.1f}KB  "
    f"({saved_pct:.0f}% smaller)"
)

Export to SQLite Database¶

import sqlite3
from contextlib import closing

from scrapernhl import HockeyScraper

nhl = HockeyScraper('nhl')

# Scrape everything first so a scraping failure never leaves a half-open DB.
teams     = nhl.scrape_teams()
standings = nhl.standings()
schedule  = nhl.schedule(team='MTL', season=20252026)

# closing() guarantees conn.close() runs even if one of the to_sql() calls
# raises. A bare `with conn:` would only manage the transaction; it does not
# close the connection.
with closing(sqlite3.connect('output/nhl_data.db')) as conn:
    # if_exists='replace' drops and recreates each table, so re-running the
    # snippet overwrites earlier data instead of failing or appending.
    teams.to_sql('teams',     conn, if_exists='replace', index=False)
    standings.to_sql('standings', conn, if_exists='replace', index=False)
    schedule.to_sql('schedule',  conn, if_exists='replace', index=False)

print("Saved to SQLite: output/nhl_data.db")
print(f"  teams={len(teams)}, standings={len(standings)}, schedule={len(schedule)}")

Incremental Append¶

nhl = HockeyScraper('nhl')

output_file = 'output/incremental_games.csv'
game_ids    = [2024020001, 2024020002, 2024020003]

for gid in game_ids:
    pbp = nhl.play_by_play(gid)
    # The first write creates the file and emits the header row; every later
    # write appends rows only. The check is repeated on each iteration.
    is_new_file = not os.path.exists(output_file)
    pbp.to_csv(
        output_file,
        mode='w' if is_new_file else 'a',
        header=is_new_file,
        index=False,
    )
    print(f"  Game {gid}: {len(pbp)} events appended")

# Re-read the accumulated file to report the total row count.
total = len(pd.read_csv(output_file))
print(f"Total events: {total}")

Export Selected Columns¶

nhl = HockeyScraper('nhl')

roster = nhl.roster(team='MTL', season=20252026)

# Subset of roster columns to keep in the simplified export.
cols = ['firstName.default', 'lastName.default', 'sweaterNumber',
        'positionCode', 'heightInInches', 'weightInPounds']

# Keep the destination in one place so the write and the message cannot drift.
roster_csv = 'output/mtl_roster_simple.csv'
roster[cols].to_csv(roster_csv, index=False)
# The original message was an f-string with no placeholders; interpolating
# the path gives the f-prefix a purpose while printing the same text.
print(f"Saved simplified roster to {roster_csv}")

Non-NHL League Export¶

from scrapernhl import HockeyScraper

ahl = HockeyScraper('ahl')

# Season id passed to both AHL calls below.
# NOTE(review): the original comment called this "the current season" - confirm 90 is still current.
ahl_season = 90

# Standings -> CSV
standings = ahl.standings(season=ahl_season)
standings.to_csv('output/ahl_standings.csv', index=False)

# Skater stats -> Parquet
skaters = ahl.player_stats(season=ahl_season, position='skaters')
skaters.to_parquet('output/ahl_skaters.parquet', index=False)

print(f"AHL standings: {len(standings)} teams")
print(f"AHL skaters:   {len(skaters)} players")

List Exported Files¶

# List everything in the output directory, alphabetically, with sizes in KB.
output_files = sorted(os.listdir('output'))
print(f"Exported {len(output_files)} files:\n")
for filename in output_files:
    full_path = os.path.join('output', filename)
    size_kb = os.path.getsize(full_path) / 1024
    # Name left-aligned in 40 columns, size right-aligned with one decimal.
    print(f"  {filename:<40} {size_kb:>8.1f} KB")

See Also¶

Basic Scraping - Getting the data
Advanced Examples - Analytics pipeline
API Reference - Function documentation