Pivot Tables and Melt
Overview
Pivot tables and melt are powerful reshaping tools:
- pivot(): Reshape long to wide (unique index/column combinations)
- pivot_table(): Pivot with aggregation (handles duplicates)
- melt(): Reshape wide to long (unpivot)
- crosstab(): Create cross-tabulations
pivot() - Basic Reshaping
Convert rows into columns when data has unique combinations:
Basic pivot operation
import pandas as pd
df = pd.DataFrame({
'date': ['2024-01-01', '2024-01-01', '2024-01-02', '2024-01-02'],
'product': ['A', 'B', 'A', 'B'],
'sales': [100, 150, 120, 160]
})
# Pivot: dates as rows, products as columns
result = df.pivot(index='date', columns='product', values='sales')
# product A B
# date
# 2024-01-01 100 150
# 2024-01-02 120 160
# Reset index to make date a regular column
result = result.reset_index()
# product date A B
# 0 2024-01-01 100 150
# 1 2024-01-02 120 160
warning
pivot() requires unique combinations of index and columns. For duplicates, use pivot_table() instead.
Multiple Values
Pivot multiple value columns
df = pd.DataFrame({
'date': ['2024-01-01', '2024-01-02'],
'product': ['A', 'A'],
'sales': [100, 120],
'profit': [20, 25]
})
# Pivot both sales and profit
result = df.pivot(index='date', columns='product', values=['sales', 'profit'])
# sales profit
# product A A
# date
# 2024-01-01 100 20
# 2024-01-02 120 25
# Flatten multi-level columns
result.columns = ['_'.join(col) for col in result.columns]
result = result.reset_index()
# date sales_A profit_A
# 0 2024-01-01 100 20
# 1 2024-01-02 120 25
pivot_table() - Aggregating Pivot
Pivot with aggregation for handling duplicate combinations:
Pivot table with aggregation
df = pd.DataFrame({
'date': ['2024-01-01', '2024-01-01', '2024-01-01', '2024-01-02'],
'product': ['A', 'B', 'A', 'B'], # 'A' appears twice on 2024-01-01
'sales': [100, 150, 110, 160]
})
# pivot() would fail due to duplicate (2024-01-01, A)
# Use pivot_table with aggregation
result = pd.pivot_table(
df,
index='date',
columns='product',
values='sales',
aggfunc='sum' # How to handle duplicates
)
# product A B
# date
# 2024-01-01 210 150 # Sum of 100 + 110
# 2024-01-02 NaN 160
Aggregation Functions
Different aggregation methods
df = pd.DataFrame({
'category': ['A', 'A', 'B', 'B', 'A', 'B'],
'type': ['X', 'Y', 'X', 'Y', 'X', 'Y'],
'value': [10, 20, 30, 40, 15, 35]
})
# Sum
result = pd.pivot_table(df, index='category', columns='type',
values='value', aggfunc='sum')
# type X Y
# category
# A 25 20
# B 30 75
# Mean
result = pd.pivot_table(df, index='category', columns='type',
values='value', aggfunc='mean')
# Count
result = pd.pivot_table(df, index='category', columns='type',
values='value', aggfunc='count')
# Multiple aggregations
result = pd.pivot_table(df, index='category', columns='type',
values='value', aggfunc=['sum', 'mean', 'count'])
Fill Missing Values
Handle missing values in pivot
df = pd.DataFrame({
'A': ['foo', 'foo', 'bar'],
'B': ['one', 'two', 'one'],
'C': [1, 2, 3]
})
# Missing combinations become NaN
result = pd.pivot_table(df, index='A', columns='B', values='C')
# B one two
# A
# bar 3.0 NaN
# foo 1.0 2.0
# Fill NaN with 0
result = pd.pivot_table(df, index='A', columns='B', values='C', fill_value=0)
# B one two
# A
# bar 3 0
# foo 1 2
Margins (Subtotals)
Add row and column totals
df = pd.DataFrame({
'category': ['A', 'A', 'B', 'B'],
'region': ['East', 'West', 'East', 'West'],
'sales': [100, 150, 200, 250]
})
result = pd.pivot_table(
df,
index='category',
columns='region',
values='sales',
aggfunc='sum',
margins=True, # Add totals
margins_name='Total' # Label for totals row/column
)
# region East West Total
# category
# A 100 150 250
# B 200 250 450
# Total 300 400 700
info
Use margins=True to add subtotals and grand totals to your pivot table, similar to Excel pivot tables.
Multiple Index/Columns
Hierarchical pivot table
df = pd.DataFrame({
'year': [2023, 2023, 2024, 2024],
'quarter': ['Q1', 'Q2', 'Q1', 'Q2'],
'region': ['East', 'West', 'East', 'West'],
'sales': [100, 150, 120, 160]
})
# Multiple row indices
result = pd.pivot_table(
df,
index=['year', 'quarter'],
columns='region',
values='sales'
)
# region East West
# year quarter
# 2023 Q1 100 NaN
# Q2 NaN 150
# 2024 Q1 120 NaN
# Q2 NaN 160
melt() - Wide to Long
Unpivot data from wide to long format:
Basic melt operation
df = pd.DataFrame({
'name': ['Alice', 'Bob'],
'math': [90, 85],
'english': [88, 92],
'science': [95, 87]
})
# name math english science
# 0 Alice 90 88 95
# 1 Bob 85 92 87
# Melt all columns except name
melted = df.melt(id_vars=['name'])
# name variable value
# 0 Alice math 90
# 1 Bob math 85
# 2 Alice english 88
# 3 Bob english 92
# 4 Alice science 95
# 5 Bob science 87
Custom Column Names
Rename melted columns
melted = df.melt(
id_vars=['name'],
var_name='subject', # Name for variable column
value_name='score' # Name for value column
)
# name subject score
# 0 Alice math 90
# 1 Bob math 85
# 2 Alice english 88
# 3 Bob english 92
# 4 Alice science 95
# 5 Bob science 87
Selective Melting
Melt specific columns only
df = pd.DataFrame({
'student': ['Alice', 'Bob'],
'class': ['A', 'B'],
'math': [90, 85],
'english': [88, 92],
'science': [95, 87]
})
# Melt only math and english
melted = df.melt(
id_vars=['student', 'class'],
value_vars=['math', 'english'], # Only these columns
var_name='subject',
value_name='score'
)
# student class subject score
# 0 Alice A math 90
# 1 Bob B math 85
# 2 Alice A english 88
# 3 Bob B english 92
Pivot and Melt Together
Round Trip Transformation
Pivot then melt back
# Start with long format
long = pd.DataFrame({
'name': ['Alice', 'Alice', 'Bob', 'Bob'],
'subject': ['math', 'english', 'math', 'english'],
'score': [90, 88, 85, 92]
})
# Convert to wide
wide = long.pivot(index='name', columns='subject', values='score')
# subject english math
# name
# Alice 88 90
# Bob 92 85
# Convert back to long
long_again = wide.reset_index().melt(
id_vars=['name'],
var_name='subject',
value_name='score'
)
# name subject score
# 0 Alice english 88
# 1 Bob english 92
# 2 Alice math 90
# 3 Bob math 85
crosstab() - Frequency Tables
Create cross-tabulation of two or more factors:
Basic crosstab
df = pd.DataFrame({
'gender': ['M', 'F', 'M', 'F', 'M', 'F'],
'handed': ['R', 'R', 'L', 'R', 'R', 'L'],
'age': [25, 30, 28, 35, 22, 27]
})
# Count combinations
result = pd.crosstab(df['gender'], df['handed'])
# handed L R
# gender
# F 1 2
# M 1 2
# With margins (totals)
result = pd.crosstab(df['gender'], df['handed'], margins=True)
# handed L R All
# gender
# F 1 2 3
# M 1 2 3
# All 2 4 6
Crosstab with Values
Crosstab with aggregation
df = pd.DataFrame({
'category': ['A', 'A', 'B', 'B', 'A', 'B'],
'region': ['East', 'West', 'East', 'West', 'East', 'West'],
'sales': [100, 150, 200, 250, 120, 280]
})
# Sum sales by category and region
result = pd.crosstab(
df['category'],
df['region'],
values=df['sales'],
aggfunc='sum'
)
# region East West
# category
# A 220 150
# B 200 530
# Normalize to show percentages
result = pd.crosstab(
df['category'],
df['region'],
normalize='all' # 'all', 'index', or 'columns'
)
# region East West
# category
# A 0.333333 0.166667
# B 0.166667 0.333333
Common Patterns
Sales by Product and Month
Reshape sales data
df = pd.DataFrame({
'date': pd.date_range('2024-01-01', periods=60, freq='D'),
'product': ['A', 'B'] * 30,
'sales': range(60)
})
# Add month column
df['month'] = df['date'].dt.to_period('M')
# Pivot to see products by month
result = pd.pivot_table(
df,
index='month',
columns='product',
values='sales',
aggfunc='sum'
)
# product A B
# month
# 2024-01 465 496
# 2024-02 899 928
Survey Data Transformation
Reshape survey responses
# Wide format survey
survey = pd.DataFrame({
'respondent': [1, 2, 3],
'q1_rating': [5, 4, 3],
'q2_rating': [4, 5, 4],
'q3_rating': [3, 4, 5]
})
# Convert to long format for analysis
long = survey.melt(
id_vars=['respondent'],
var_name='question',
value_name='rating'
)
# Clean question names
long['question'] = long['question'].str.replace('_rating', '')
# respondent question rating
# 0 1 q1 5
# 1 2 q1 4
# 2 3 q1 3
# 3 1 q2 4
# 4 2 q2 5
# 5 3 q2 4
# Analyze
long.groupby('question')['rating'].mean()
# question
# q1 4.0
# q2 4.333333
# q3 4.0
Time Series Pivoting
Pivot time series data
df = pd.DataFrame({
'date': pd.date_range('2024-01-01', periods=12, freq='M'),
'metric': ['revenue', 'cost'] * 6,
'value': [100, 80, 110, 85, 120, 90, 115, 88, 125, 92, 130, 95]
})
# Pivot to have metrics as columns
result = df.pivot(index='date', columns='metric', values='value')
result['profit'] = result['revenue'] - result['cost']
# result:
# metric cost revenue profit
# date
# 2024-01-31 80 100 20
# 2024-02-29 85 110 25
# 2024-03-31 90 120 30
Advanced Techniques
Multi-Level Melt
Melt with multiple value columns
df = pd.DataFrame({
'student': ['Alice', 'Bob'],
'math_score': [90, 85],
'math_grade': ['A', 'B'],
'english_score': [88, 92],
'english_grade': ['B', 'A']
})
# Melt keeping score and grade together
# First melt scores
scores = df.melt(
id_vars=['student'],
value_vars=['math_score', 'english_score'],
var_name='subject_score',
value_name='score'
)
scores['subject'] = scores['subject_score'].str.replace('_score', '')
# Then melt grades
grades = df.melt(
id_vars=['student'],
value_vars=['math_grade', 'english_grade'],
var_name='subject_grade',
value_name='grade'
)
grades['subject'] = grades['subject_grade'].str.replace('_grade', '')
# Merge back together
result = pd.merge(scores[['student', 'subject', 'score']],
grades[['student', 'subject', 'grade']],
on=['student', 'subject'])
Conditional Pivot
Pivot with conditional values
df = pd.DataFrame({
'category': ['A', 'A', 'B', 'B'],
'metric': ['sales', 'profit', 'sales', 'profit'],
'value': [100, 20, 200, 40]
})
# Pivot
result = df.pivot(index='category', columns='metric', values='value')
# Calculate profit margin
result['margin'] = (result['profit'] / result['sales'] * 100).round(2)
# metric profit sales margin
# category
# A 20 100 20.0
# B 40 200 20.0
Performance Tips
Optimize pivot operations
# For large DataFrames, use categorical for repeated values
df['category'] = df['category'].astype('category')
df['region'] = df['region'].astype('category')
# Then pivot
result = pd.pivot_table(df, index='category', columns='region', values='sales')
# Use observed=True in groupby before pivot
# (Faster with categorical data)
success
Convert columns to categorical before pivoting if they have many repeated values. This significantly improves performance and reduces memory usage.
Common Mistakes
Pivot with Duplicates
Handle duplicate index-column combinations
df = pd.DataFrame({
'A': ['foo', 'foo', 'foo'],
'B': ['one', 'one', 'two'],
'C': [1, 2, 3]
})
# This fails - duplicate (foo, one)
# df.pivot(index='A', columns='B', values='C') # ValueError!
# Use pivot_table instead
result = pd.pivot_table(df, index='A', columns='B', values='C', aggfunc='sum')
# B one two
# A
# foo 3 3
Forgetting reset_index()
Reset index after pivot
# Pivot creates index
result = df.pivot(index='date', columns='product', values='sales')
# product A B
# date
# 2024-01 100 150
# Index makes filtering harder
# result[result['date'] == '2024-01'] # Error! date is index
# Reset to make date a column
result = result.reset_index()
# Now can filter: result[result['date'] == '2024-01']
Column Name Confusion
Handle multi-level column names
df = pd.DataFrame({
'date': ['2024-01-01', '2024-01-02'],
'product': ['A', 'A'],
'sales': [100, 120],
'profit': [20, 25]
})
result = df.pivot(index='date', columns='product', values=['sales', 'profit'])
# Multi-level columns: ('sales', 'A'), ('profit', 'A')
# Flatten column names
result.columns = ['_'.join(col) for col in result.columns]
# Now: sales_A, profit_A
Quick Reference
Pivot (unique combinations):
df.pivot(index='row', columns='col', values='val')
df.pivot(index='row', columns='col', values=['val1', 'val2'])
Pivot table (with aggregation):
pd.pivot_table(df, index='row', columns='col', values='val', aggfunc='sum')
pd.pivot_table(df, index='row', columns='col', aggfunc=['sum', 'mean'])
pd.pivot_table(df, index='row', columns='col', fill_value=0)
pd.pivot_table(df, index='row', columns='col', margins=True)
Melt (wide to long):
df.melt(id_vars=['id'])
df.melt(id_vars=['id'], value_vars=['col1', 'col2'])
df.melt(id_vars=['id'], var_name='variable', value_name='value')
Crosstab:
pd.crosstab(df['row'], df['col'])
pd.crosstab(df['row'], df['col'], values=df['val'], aggfunc='sum')
pd.crosstab(df['row'], df['col'], normalize='all')
pd.crosstab(df['row'], df['col'], margins=True)
Common patterns:
# Pivot then calculate
wide = df.pivot(index='date', columns='product', values='sales')
wide['total'] = wide.sum(axis=1)
# Melt then aggregate
long = df.melt(id_vars=['id'])
long.groupby('variable')['value'].mean()
# Round trip
wide = long.pivot(index='id', columns='var', values='val')
long = wide.reset_index().melt(id_vars=['id'])
# Flatten multi-level columns
df.columns = ['_'.join(col) for col in df.columns]