Reading and Writing Data
Overview
pandas can read from and write to many file formats. The most common pattern is pd.read_*()
for reading and df.to_*() for writing.
Common formats:
- CSV:
read_csv(),to_csv() - Excel:
read_excel(),to_excel() - JSON:
read_json(),to_json() - SQL:
read_sql(),to_sql() - Parquet:
read_parquet(),to_parquet()
CSV Files
CSV is the most common format. It's plain text, human-readable, and universally supported.
Reading CSV
Basic usage:
import pandas as pd
df = pd.read_csv('data.csv')
Common parameters:
# Custom delimiter
df = pd.read_csv('data.tsv', sep='\t') # Tab-separated
df = pd.read_csv('data.txt', sep='|') # Pipe-separated
# Specify column names if file has no header
df = pd.read_csv('data.csv', header=None)
df = pd.read_csv('data.csv', names=['A', 'B', 'C'])
# Skip rows
df = pd.read_csv('data.csv', skiprows=3) # Skip first 3 rows
df = pd.read_csv('data.csv', skiprows=[0, 2, 5]) # Skip specific rows
# Use specific column as index
df = pd.read_csv('data.csv', index_col=0) # First column
df = pd.read_csv('data.csv', index_col='id') # Column named 'id'
# Select specific columns
df = pd.read_csv('data.csv', usecols=['name', 'age'])
df = pd.read_csv('data.csv', usecols=[0, 1, 3]) # By position
Data types:
By default, pandas infers types, but you can be explicit:
# Specify dtypes
df = pd.read_csv('data.csv', dtype={
'age': int,
'price': float,
'category': 'category' # Saves memory for repeated values
})
# Parse dates automatically
df = pd.read_csv('data.csv', parse_dates=['date_column'])
# Parse dates from multiple columns
df = pd.read_csv('data.csv', parse_dates=[['year', 'month', 'day']])
Missing values:
# Recognize custom NA values
df = pd.read_csv('data.csv', na_values=['NA', 'missing', '?'])
# Different NA values per column
df = pd.read_csv('data.csv', na_values={
'age': ['?', 'unknown'],
'price': [0, -999]
})
# Keep default NA values
df = pd.read_csv('data.csv', keep_default_na=True)
Large files:
For files that don't fit in memory:
# Read in chunks
chunk_size = 10000
chunks = []
for chunk in pd.read_csv('large.csv', chunksize=chunk_size):
# Process each chunk
processed = chunk[chunk['value'] > 0]
chunks.append(processed)
df = pd.concat(chunks, ignore_index=True)
# Or read only n rows
df = pd.read_csv('large.csv', nrows=1000) # First 1000 rows
Encoding:
If you see strange characters, check the encoding:
# Common encodings
df = pd.read_csv('data.csv', encoding='utf-8') # Default
df = pd.read_csv('data.csv', encoding='latin-1') # Western European
df = pd.read_csv('data.csv', encoding='cp1252') # Windows
# Let pandas guess
df = pd.read_csv('data.csv', encoding_errors='ignore')
Writing CSV
# Basic write
df.to_csv('output.csv')
# Without row index
df.to_csv('output.csv', index=False)
# Custom separator
df.to_csv('output.tsv', sep='\t')
# Select columns
df.to_csv('output.csv', columns=['name', 'age'])
# Custom NA representation
df.to_csv('output.csv', na_rep='NULL')
# Append to existing file
df.to_csv('output.csv', mode='a', header=False)
# Compression
df.to_csv('output.csv.gz', compression='gzip')
df.to_csv('output.csv.zip', compression='zip')
Excel Files
Excel files (.xlsx, .xls) can contain multiple sheets and formatting. Reading Excel requires the
openpyxl or xlrd library.
# Install if needed
pip install openpyxl
Reading Excel
# Read first sheet
df = pd.read_excel('data.xlsx')
# Specific sheet by name
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
# Specific sheet by position
df = pd.read_excel('data.xlsx', sheet_name=0) # First sheet
# Read all sheets into dictionary
dfs = pd.read_excel('data.xlsx', sheet_name=None)
# dfs = {'Sheet1': df1, 'Sheet2': df2, ...}
# Multiple specific sheets
dfs = pd.read_excel('data.xlsx', sheet_name=['Sheet1', 'Sheet3'])
Other parameters work like CSV:
df = pd.read_excel(
'data.xlsx',
sheet_name='Data',
skiprows=2, # Skip first 2 rows
usecols='A:D', # Columns A through D
nrows=100, # Read only 100 rows
dtype={'age': int},
na_values=['N/A']
)
# Column range by position
df = pd.read_excel('data.xlsx', usecols=[0, 1, 2])
# Named ranges (if defined in Excel)
df = pd.read_excel('data.xlsx', usecols='SalesData')
Writing Excel
# Basic write
df.to_excel('output.xlsx', index=False)
# Specific sheet name
df.to_excel('output.xlsx', sheet_name='MyData', index=False)
# Multiple sheets
with pd.ExcelWriter('output.xlsx') as writer:
df1.to_excel(writer, sheet_name='Sales', index=False)
df2.to_excel(writer, sheet_name='Customers', index=False)
df3.to_excel(writer, sheet_name='Products', index=False)
# Append to existing file (requires openpyxl)
with pd.ExcelWriter('existing.xlsx', mode='a') as writer:
df.to_excel(writer, sheet_name='NewSheet')
Formatting (requires openpyxl):
with pd.ExcelWriter('formatted.xlsx', engine='openpyxl') as writer:
df.to_excel(writer, sheet_name='Data', index=False)
# Access worksheet for formatting
worksheet = writer.sheets['Data']
worksheet.column_dimensions['A'].width = 20
JSON Files
JSON is common for web APIs and nested data structures.
Reading JSON
# From file
df = pd.read_json('data.json')
# From string
json_str = '{"name": ["Alice", "Bob"], "age": [25, 30]}'
df = pd.read_json(json_str)
# Orientation matters
# records: [{col: val}, {col: val}]
df = pd.read_json('data.json', orient='records')
# columns: {col: {index: val}}
df = pd.read_json('data.json', orient='columns')
# index: {index: {col: val}}
df = pd.read_json('data.json', orient='index')
Common JSON structures:
# Array of objects (most common from APIs)
# [{"name": "Alice", "age": 25}, {"name": "Bob", "age": 30}]
df = pd.read_json('data.json', orient='records')
# Nested JSON requires normalization
import json
with open('nested.json') as f:
data = json.load(f)
df = pd.json_normalize(data)
# Example: API response with nested data
# {"users": [{"name": "Alice", "address": {"city": "NYC"}}]}
df = pd.json_normalize(data['users'])
# Columns: name, address.city
Writing JSON
# Default orientation
df.to_json('output.json')
# Records format (array of objects)
df.to_json('output.json', orient='records')
# Pretty print with indentation
df.to_json('output.json', orient='records', indent=2)
# Specific date format
df.to_json('output.json', date_format='iso')
# Split format (useful for archiving)
df.to_json('output.json', orient='split')
SQL Databases
pandas can read from and write to SQL databases. Requires sqlalchemy or database-specific driver.
pip install sqlalchemy psycopg2-binary # For PostgreSQL
pip install sqlalchemy pymysql # For MySQL
Reading from SQL
from sqlalchemy import create_engine
# Create connection
engine = create_engine('postgresql://user:password@localhost:5432/dbname')
# or: sqlite:///path/to/database.db
# or: mysql+pymysql://user:password@localhost/dbname
# Read entire table
df = pd.read_sql_table('table_name', engine)
# Execute query
query = "SELECT * FROM users WHERE age > 25"
df = pd.read_sql_query(query, engine)
# Auto-detect (table or query)
df = pd.read_sql("SELECT * FROM users", engine)
# With parameters (prevents SQL injection)
query = "SELECT * FROM users WHERE age > %(min_age)s"
df = pd.read_sql_query(query, engine, params={'min_age': 25})
# Parse dates
df = pd.read_sql_query(
"SELECT * FROM events",
engine,
parse_dates=['created_at', 'updated_at']
)
Reading in chunks:
# For large tables
for chunk in pd.read_sql_query(query, engine, chunksize=10000):
# Process each chunk
process(chunk)
Writing to SQL
# Write DataFrame to SQL
df.to_sql('table_name', engine, if_exists='replace', index=False)
# if_exists options:
# 'fail': Raise error if table exists (default)
# 'replace': Drop table and recreate
# 'append': Add data to existing table
# Append data
df.to_sql('logs', engine, if_exists='append', index=False)
# Specify data types
from sqlalchemy.types import Integer, String, Float
df.to_sql('table_name', engine, dtype={
'id': Integer,
'name': String(50),
'price': Float
})
# Write in chunks (for large DataFrames)
df.to_sql('table_name', engine, chunksize=1000)
Parquet Files
Parquet is a columnar storage format that's fast and efficient. Excellent for large datasets.
pip install pyarrow
# or:
pip install fastparquet
Why Parquet?
- Fast: Much faster than CSV for reading/writing
- Compact: Compressed by default, smaller file sizes
- Type-safe: Preserves data types exactly
- Columnar: Efficient for selecting specific columns
# Write parquet
df.to_parquet('data.parquet')
# With specific engine
df.to_parquet('data.parquet', engine='pyarrow')
# Compression options
df.to_parquet('data.parquet', compression='gzip')
df.to_parquet('data.parquet', compression='snappy') # Faster
df.to_parquet('data.parquet', compression='brotli') # Better compression
# Read parquet
df = pd.read_parquet('data.parquet')
# Read specific columns only
df = pd.read_parquet('data.parquet', columns=['name', 'age'])
# Read with filters (requires pyarrow)
df = pd.read_parquet(
'data.parquet',
filters=[('age', '>', 25)]
)
When to use Parquet:
- Storing intermediate analysis results
- Archiving processed data
- Sharing data between Python and other tools (Spark, R)
- Need to preserve exact data types
Other Formats
HTML Tables
# Read tables from HTML
dfs = pd.read_html('https://example.com/data.html') # Returns list of DataFrames
df = dfs[0] # First table
# Read from local file
dfs = pd.read_html('file.html')
# Write HTML table
df.to_html('output.html', index=False)
Clipboard
Useful for quick copy-paste:
# Copy from clipboard (e.g., Excel selection)
df = pd.read_clipboard()
# Copy to clipboard
df.to_clipboard(index=False)
Pickle (Python-specific)
Preserves pandas objects exactly, but only for Python:
# Save
df.to_pickle('data.pkl')
# Load
df = pd.read_pickle('data.pkl')
Warning: Pickle files can execute arbitrary code. Only use with trusted sources.
Feather
Fast binary format, interoperable with R and other languages:
pip install pyarrow
df.to_feather('data.feather')
df = pd.read_feather('data.feather')
Best Practices
Choosing a format:
- CSV: Human-readable, universal compatibility, simple data
- Excel: Sharing with non-programmers, multiple sheets, small datasets
- Parquet: Large datasets, preserving types, archiving, performance
- JSON: Web APIs, nested structures, configuration
- SQL: Centralized data, complex queries, multi-user access
Performance tips:
# For large CSVs, specify dtypes upfront
df = pd.read_csv('large.csv', dtype={
'id': 'int32', # Instead of int64
'category': 'category', # Instead of object
'price': 'float32' # Instead of float64
})
# Use chunksize for files larger than RAM
for chunk in pd.read_csv('huge.csv', chunksize=50000):
process(chunk)
# Use Parquet for repeated reads
df = pd.read_csv('data.csv')
df.to_parquet('data.parquet') # One-time conversion
df = pd.read_parquet('data.parquet') # Much faster subsequent reads
Data integrity:
# Always check data after reading
df.info() # Types, non-null counts
df.head() # First few rows
df.describe() # Statistics
df.isnull().sum() # Missing values per column
# Verify critical columns
assert df['id'].is_unique
assert df['price'].min() >= 0
Memory management:
# Check memory usage
df.memory_usage(deep=True).sum() / 1024 ** 2 # MB
# Optimize dtypes after reading
df['category'] = df['category'].astype('category')
df['year'] = df['year'].astype('int16') # If values fit
# Read only needed columns
df = pd.read_csv('data.csv', usecols=['id', 'name', 'price'])
Quick Reference
Reading:
pd.read_csv('file.csv')
pd.read_excel('file.xlsx', sheet_name='Sheet1')
pd.read_json('file.json', orient='records')
pd.read_sql('SELECT * FROM table', engine)
pd.read_parquet('file.parquet')
Writing:
df.to_csv('file.csv', index=False)
df.to_excel('file.xlsx', sheet_name='Data', index=False)
df.to_json('file.json', orient='records')
df.to_sql('table', engine, if_exists='replace')
df.to_parquet('file.parquet')
Common parameters:
index = False # Don't write index
usecols = ['A', 'B'] # Read specific columns
dtype = {'col': int} # Specify types
parse_dates = ['date'] # Parse as datetime
na_values = ['?', 'NA'] # Recognize missing values
chunksize = 10000 # Read in chunks