Data Export Examples¶
Learn how to save scraped data to various file formats.
Setup¶
import pandas as pd
import os
from datetime import datetime
# Create output directory
os.makedirs('output', exist_ok=True)
print("Created output/ directory for exported files")
Export to CSV¶
from scrapernhl.scrapers.teams import scrapeTeams
from scrapernhl.scrapers.schedule import scrapeSchedule
# Scrape teams
teams = scrapeTeams()
teams.to_csv('output/nhl_teams.csv', index=False)
print(f"✅ Saved {len(teams)} teams to output/nhl_teams.csv")
# Scrape schedule
schedule = scrapeSchedule("MTL", "20252026")
schedule.to_csv('output/mtl_schedule.csv', index=False)
print(f"✅ Saved {len(schedule)} games to output/mtl_schedule.csv")
Export to Excel (Multiple Sheets)¶
from scrapernhl.scrapers.standings import scrapeStandings
from scrapernhl.scrapers.stats import scrapeTeamStats
# Scrape data
today = datetime.now().strftime("%Y-%m-%d")
standings = scrapeStandings(today)
skaters = scrapeTeamStats("MTL", "20252026", goalies=False)
goalies = scrapeTeamStats("MTL", "20252026", goalies=True)
# Save to Excel with multiple sheets
with pd.ExcelWriter('output/nhl_data.xlsx', engine='openpyxl') as writer:
    standings.to_excel(writer, sheet_name='Standings', index=False)
    skaters.to_excel(writer, sheet_name='Skaters', index=False)
    goalies.to_excel(writer, sheet_name='Goalies', index=False)
print("✅ Saved Excel file with 3 sheets to output/nhl_data.xlsx")
print(f" - Standings: {len(standings)} teams")
print(f" - Skaters: {len(skaters)} players")
print(f" - Goalies: {len(goalies)} goalies")
Export to JSON¶
from scrapernhl.scrapers.games import scrapePlays
# Get a completed game
completed_games = schedule[schedule['gameState'] == 'OFF']
if len(completed_games) > 0:
    game_id = completed_games.iloc[0]['id']
    # Scrape play-by-play
    pbp = scrapePlays(game_id)
    # Save as JSON (pretty printed)
    pbp.to_json('output/game_pbp.json', orient='records', indent=2)
    print(f"✅ Saved {len(pbp)} events to output/game_pbp.json (pretty format)")
    # Save as JSON lines (more efficient for large files)
    pbp.to_json('output/game_pbp.jsonl', orient='records', lines=True)
    print(f"✅ Saved {len(pbp)} events to output/game_pbp.jsonl (lines format)")
    # Compare file sizes
    json_size = os.path.getsize('output/game_pbp.json') / 1024
    jsonl_size = os.path.getsize('output/game_pbp.jsonl') / 1024
    print(f" File sizes: JSON={json_size:.1f}KB, JSONL={jsonl_size:.1f}KB")
else:
    print("No completed games found")
Export to Parquet (Recommended for Large Datasets)¶
# Scrape up to three completed games
game_ids = completed_games.head(3)['id'].tolist()
if game_ids:
    all_pbp = []
    for gid in game_ids:
        print(f"Scraping game {gid}...")
        pbp = scrapePlays(gid)
        all_pbp.append(pbp)
    # Combine
    combined = pd.concat(all_pbp, ignore_index=True)
    # Save as Parquet (compressed)
    combined.to_parquet('output/games_pbp.parquet', index=False, compression='snappy')
    print(f"✅ Saved {len(combined)} events to output/games_pbp.parquet")
    # Read it back to verify
    df = pd.read_parquet('output/games_pbp.parquet')
    print(f"✅ Verified: Loaded {len(df)} events from parquet")
    # Compare with CSV
    combined.to_csv('output/games_pbp.csv', index=False)
    parquet_size = os.path.getsize('output/games_pbp.parquet') / 1024
    csv_size = os.path.getsize('output/games_pbp.csv') / 1024
    print(f" File sizes: Parquet={parquet_size:.1f}KB, CSV={csv_size:.1f}KB")
    print(f" Compression: {100 * (1 - parquet_size/csv_size):.1f}% smaller")
else:
    print("No completed games available")
Export with Polars (Faster for Large Datasets)¶
# Get data as Polars DataFrame
teams_pl = scrapeTeams(output_format="polars")
# Export to various formats
teams_pl.write_csv('output/teams_polars.csv')
teams_pl.write_json('output/teams_polars.json')
teams_pl.write_parquet('output/teams_polars.parquet')
print(f"✅ Exported {len(teams_pl)} teams using Polars to:")
print(" - output/teams_polars.csv")
print(" - output/teams_polars.json")
print(" - output/teams_polars.parquet")
Export to SQLite Database¶
import sqlite3
# Create database connection
conn = sqlite3.connect('output/nhl_data.db')
# Save multiple tables
teams.to_sql('teams', conn, if_exists='replace', index=False)
standings.to_sql('standings', conn, if_exists='replace', index=False)
schedule.to_sql('schedule', conn, if_exists='replace', index=False)
print("✅ Saved to SQLite database: output/nhl_data.db")
print(f" Tables: teams ({len(teams)} rows), standings ({len(standings)} rows), schedule ({len(schedule)} rows)")
# Query the database
query = "SELECT fullName, id, teamPlaceName FROM teams LIMIT 5"
result = pd.read_sql(query, conn)
print("\n Sample query result:")
print(result)
conn.close()
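For queries built from variables or user input, prefer parameterized placeholders over string formatting. A sketch reusing the schedule table written above:

from contextlib import closing

# "?" placeholders let sqlite3 handle quoting safely
with closing(sqlite3.connect('output/nhl_data.db')) as conn:
    finished = pd.read_sql("SELECT * FROM schedule WHERE gameState = ?", conn, params=('OFF',))
print(f"{len(finished)} completed games in the schedule table")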
Incremental Export (Append Mode)¶
output_file = 'output/incremental_games.csv'
# Remove if exists (for demo)
if os.path.exists(output_file):
    os.remove(output_file)
# Scrape games incrementally
for gid in game_ids[:2]:
    print(f"Scraping game {gid}...")
    pbp = scrapePlays(gid)
    # Append to CSV (create the file with a header on the first write)
    if os.path.exists(output_file):
        pbp.to_csv(output_file, mode='a', header=False, index=False)
        print(f" Appended {len(pbp)} events")
    else:
        pbp.to_csv(output_file, mode='w', header=True, index=False)
        print(f" Created file with {len(pbp)} events")
if os.path.exists(output_file):
    total_rows = len(pd.read_csv(output_file))
    print(f"\n✅ Total events in incremental file: {total_rows}")
Export Selected Columns¶
from scrapernhl.scrapers.roster import scrapeRoster
# Scrape roster
roster = scrapeRoster("MTL", "20252026")
# Export only specific columns
columns_to_export = ['firstName.default', 'lastName.default', 'sweaterNumber', 'positionCode', 'heightInInches', 'weightInPounds']
roster[columns_to_export].to_csv('output/mtl_roster_simple.csv', index=False)
print(f"✅ Saved simplified roster ({len(columns_to_export)} columns) to output/mtl_roster_simple.csv")
Export with Custom Formatting¶
# Add custom columns and formatting
schedule_formatted = schedule.copy()
schedule_formatted['gameDate'] = pd.to_datetime(schedule_formatted['gameDate']).dt.strftime('%Y-%m-%d')
schedule_formatted['season'] = '2025-26'
schedule_formatted['scraped_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# Export with custom column order
column_order = ['gameDate', 'homeTeam.abbrev', 'awayTeam.abbrev', 'gameState', 'season', 'scraped_at']
schedule_formatted[column_order].to_csv('output/mtl_schedule_formatted.csv', index=False)
print("✅ Saved formatted schedule to output/mtl_schedule_formatted.csv")
print(" Custom columns: season, scraped_at")
print(" Date format: YYYY-MM-DD")
List All Exported Files¶
# List all files in output directory
output_files = os.listdir('output')
print(f"\n📁 Exported {len(output_files)} files to output/ directory:\n")
for file in sorted(output_files):
    filepath = os.path.join('output', file)
    size = os.path.getsize(filepath) / 1024
    print(f" {file:<30} {size:>8.1f} KB")
See Also¶
- Basic Scraping - Getting the data
- Advanced Examples - Data processing
- API Reference - Function documentation