Skip to main content

Missing Data

Overview

Missing data appears as NaN (Not a Number), None, or NaT (Not a Time) in pandas. Handling it correctly is crucial for data analysis.

Detecting Missing Data

Check for Missing Values

Find missing values
import pandas as pd
import numpy as np

df = pd.DataFrame({
'name': ['Alice', 'Bob', None, 'David'],
'age': [25, np.nan, 35, 28],
'city': ['NYC', 'LA', 'Chicago', None]
})

# Check which values are missing
df.isnull() # or df.isna()
# name age city
# 0 False False False
# 1 False True False
# 2 True False False
# 3 False False True

# Check which values are NOT missing
df.notnull() # or df.notna()

isnull() and isna() are identical. Use whichever you prefer.

Count Missing Values

Count missing values per column
# Missing values per column
df.isnull().sum()
# name 1
# age 1
# city 1

# Total missing values
df.isnull().sum().sum() # 3

# Percentage missing per column
(df.isnull().sum() / len(df) * 100).round(2)
# name 25.0
# age 25.0
# city 25.0

Find Rows with Missing Data

Get rows with missing values
# Rows with any missing value
df[df.isnull().any(axis=1)]
# name age city
# 1 Bob NaN LA
# 2 None 35.0 Chicago
# 3 David 28.0 None

# Rows with all values missing
df[df.isnull().all(axis=1)]

# Rows with specific column missing
df[df['age'].isnull()]
# name age city
# 1 Bob NaN LA

# Rows with multiple columns missing
df[df['name'].isnull() | df['city'].isnull()]

Summary of Missing Data

Missing data summary
# Create summary DataFrame
missing_summary = pd.DataFrame({
'missing_count': df.isnull().sum(),
'missing_pct': (df.isnull().sum() / len(df) * 100).round(2)
})
print(missing_summary)
# missing_count missing_pct
# name 1 25.00
# age 1 25.00
# city 1 25.00

Removing Missing Data

dropna() - Remove Rows/Columns

Drop rows with missing values
df = pd.DataFrame({
'A': [1, 2, np.nan, 4],
'B': [5, np.nan, np.nan, 8],
'C': [9, 10, 11, 12]
})

# Drop rows with any missing value (default)
df.dropna()
# A B C
# 0 1.0 5.0 9
# 3 4.0 8.0 12

# Original df unchanged unless inplace=True
df.dropna(inplace=True)

Drop Based on Thresholds

Drop with conditions
df = pd.DataFrame({
'A': [1, np.nan, np.nan, 4],
'B': [5, 6, np.nan, 8],
'C': [9, 10, 11, 12]
})

# Drop rows with ALL values missing
df.dropna(how='all')
# A B C
# 0 1.0 5.0 9
# 1 NaN 6.0 10
# 2 NaN NaN 11
# 3 4.0 8.0 12

# Drop rows with at least N non-null values
df.dropna(thresh=2) # Keep rows with at least 2 non-null
# A B C
# 0 1.0 5.0 9
# 1 NaN 6.0 10
# 3 4.0 8.0 12

Drop Columns with Missing Data

Drop columns instead of rows
# Drop columns with any missing value
df.dropna(axis=1)
# C
# 0 9
# 1 10
# 2 11
# 3 12

# Drop columns with all missing values
df.dropna(axis=1, how='all')

# Drop columns with too many missing values
threshold = len(df) * 0.5 # More than 50% missing
df.dropna(axis=1, thresh=threshold)

Drop Based on Specific Columns

Drop based on subset of columns
df = pd.DataFrame({
'id': [1, 2, 3, 4],
'value': [100, np.nan, 300, np.nan],
'optional': [np.nan, np.nan, np.nan, np.nan]
})

# Drop only if 'value' is missing (ignore 'optional')
df.dropna(subset=['value'])
# id value optional
# 0 1 100.0 NaN
# 2 3 300.0 NaN

# Drop if either id or value is missing
df.dropna(subset=['id', 'value'])

Filling Missing Data

fillna() - Replace Missing Values

Fill with constant value
df = pd.DataFrame({
'A': [1, np.nan, 3],
'B': [4, 5, np.nan]
})

# Fill all missing with 0
df.fillna(0)
# A B
# 0 1.0 4.0
# 1 0.0 5.0
# 2 3.0 0.0

# Fill with specific values per column
df.fillna({'A': 0, 'B': 999})
# A B
# 0 1.0 4.0
# 1 0.0 5.0
# 2 3.0 999.0

Fill with Statistics

Fill with mean, median, mode
df = pd.DataFrame({
'price': [100, 200, np.nan, 400, np.nan],
'quantity': [1, np.nan, 3, 4, 5]
})

# Fill with mean
df['price'].fillna(df['price'].mean())
# 0 100.0
# 1 200.0
# 2 233.33
# 3 400.0
# 4 233.33

# Fill with median (more robust to outliers)
df['price'].fillna(df['price'].median())

# Fill with mode (most common value)
df['quantity'].fillna(df['quantity'].mode()[0])

# Fill all numeric columns with mean
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

Forward Fill and Backward Fill

Fill with adjacent values
df = pd.DataFrame({
'value': [1, np.nan, np.nan, 4, np.nan]
})

# Forward fill (use previous value)
df['value'].fillna(method='ffill') # or method='pad'
# 0 1.0
# 1 1.0 # Filled from row 0
# 2 1.0 # Filled from row 1
# 3 4.0
# 4 4.0 # Filled from row 3

# Backward fill (use next value)
df['value'].fillna(method='bfill') # or method='backfill'
# 0 1.0
# 1 4.0 # Filled from row 3
# 2 4.0 # Filled from row 3
# 3 4.0
# 4 NaN # No value after this

# Limit number of consecutive fills
df['value'].fillna(method='ffill', limit=1)
# Only fills 1 consecutive NaN

Forward/backward fill is useful for time series data.

Fill with Interpolation

Interpolate missing values
df = pd.DataFrame({
'value': [1, np.nan, np.nan, 4, np.nan, 6]
})

# Linear interpolation
df['value'].interpolate()
# 0 1.0
# 1 2.0 # Interpolated
# 2 3.0 # Interpolated
# 3 4.0
# 4 5.0 # Interpolated
# 5 6.0

# Different interpolation methods
df['value'].interpolate(method='polynomial', order=2)
df['value'].interpolate(method='spline', order=2)

Interpolation estimates values based on surrounding data.

Replacing Values with NaN

Replace Specific Values

Convert values to NaN
df = pd.DataFrame({
'age': [25, -999, 30, 0, 35],
'income': [50000, 0, 60000, 'N/A', 70000]
})

# Replace specific value with NaN
df['age'].replace(-999, np.nan)
# 0 25.0
# 1 NaN
# 2 30.0

# Replace multiple values
df['age'].replace([0, -999], np.nan)

# Replace in entire DataFrame
df.replace('N/A', np.nan)

# Replace using dictionary
df.replace({'age': {0: np.nan}, 'income': {'N/A': np.nan}})

Common placeholders: -999, 0, 'N/A', 'NULL', '', '?'

Group-Based Filling

Fill with Group Statistics

Fill missing values by group
df = pd.DataFrame({
'category': ['A', 'A', 'B', 'B', 'A'],
'value': [10, np.nan, 20, np.nan, 30]
})

# Fill with group mean
df['value'] = df.groupby('category')['value'].transform(
lambda x: x.fillna(x.mean())
)
# category value
# 0 A 10.0
# 1 A 20.0 # Filled with mean of A (10+30)/2
# 2 B 20.0
# 3 B 20.0 # Filled with mean of B (20)
# 4 A 30.0

# Fill with group median
df['value'] = df.groupby('category')['value'].transform(
lambda x: x.fillna(x.median())
)

This preserves group-level patterns in the data.

Handling Special Cases

NaT (Not a Time) for Dates

Missing datetime values
df = pd.DataFrame({
'date': pd.to_datetime(['2024-01-01', None, '2024-01-03'])
})

# Check for missing dates
df['date'].isnull()
# 0 False
# 1 True
# 2 False

# Fill with specific date
df['date'].fillna(pd.to_datetime('2024-01-01'))

# Forward fill dates
df['date'].fillna(method='ffill')

Empty Strings vs NaN

Distinguish between empty and NaN
df = pd.DataFrame({
'text': ['hello', '', None, 'world']
})

# Empty string is not NaN
df['text'].isnull()
# 0 False
# 1 False # Empty string is not null
# 2 True
# 3 False

# Convert empty strings to NaN
df['text'].replace('', np.nan)

# Or during read
df = pd.read_csv('data.csv', na_values=['', 'NA', 'NULL'])

Checking for Missing After Operations

Verify Cleaning Results

Validate missing data handling
# Before cleaning
print(f"Missing before: {df.isnull().sum().sum()}")

# Clean data
df_clean = df.fillna(df.mean())

# After cleaning
print(f"Missing after: {df_clean.isnull().sum().sum()}")

# Ensure no missing values remain
assert df_clean.isnull().sum().sum() == 0, "Still have missing values!"

Common Patterns

Strategy: Drop vs Fill

Decision framework for handling missing data
df = pd.DataFrame({
'critical': [1, 2, np.nan, 4], # Can't proceed without this
'optional': [5, np.nan, np.nan, 8], # Can fill or ignore
'metadata': [np.nan] * 4 # All missing, can drop
})

# Drop rows where critical column is missing
df = df.dropna(subset=['critical'])

# Fill optional column
df['optional'] = df['optional'].fillna(df['optional'].median())

# Drop entirely missing columns
df = df.dropna(axis=1, how='all')

Time Series Missing Data

Handle missing in time series
df = pd.DataFrame({
'date': pd.date_range('2024-01-01', periods=5),
'value': [100, np.nan, np.nan, 400, 500]
})

# Forward fill (carry last observation forward)
df['value_ffill'] = df['value'].fillna(method='ffill')

# Interpolate (smooth transition)
df['value_interp'] = df['value'].interpolate()

# Both approaches common in time series

Conditional Filling

Fill based on conditions
df = pd.DataFrame({
'type': ['A', 'A', 'B', 'B'],
'value': [10, np.nan, np.nan, 40]
})

# Fill only for specific type
df.loc[df['type'] == 'A', 'value'] = df.loc[df['type'] == 'A', 'value'].fillna(0)

# Or use np.where
df['value'] = np.where(
df['value'].isnull() & (df['type'] == 'A'),
0,
df['value']
)

Performance Tips

Efficient Missing Value Operations

Performance considerations
# Slow: Check each column separately
for col in df.columns:
if df[col].isnull().any():
df[col].fillna(0, inplace=True)

# Fast: Vectorized operation
df.fillna(0, inplace=True)

# For large DataFrames, check if filling is needed first
if df.isnull().any().any():
df.fillna(0, inplace=True)

Common Mistakes

Comparing NaN

NaN comparison behavior
# NaN != NaN (always)
np.nan == np.nan # False!

# Use pandas methods to check
pd.isna(np.nan) # True
pd.isna(None) # True

# Don't do this:
# df[df['col'] == np.nan] # Returns empty!

# Do this:
df[df['col'].isna()] # Correct

Modifying Copies

Copy vs view warning
# This might not work
subset = df[df['A'] > 5]
subset['B'].fillna(0) # Warning!

# Do this instead
subset = df[df['A'] > 5].copy()
subset['B'].fillna(0, inplace=True)

# Or use loc
df.loc[df['A'] > 5, 'B'] = df.loc[df['A'] > 5, 'B'].fillna(0)

Forgetting inplace=False

fillna doesn't modify by default
df = pd.DataFrame({'A': [1, np.nan, 3]})

# This doesn't change df
df['A'].fillna(0) # Returns new Series

# Need to assign or use inplace
df['A'] = df['A'].fillna(0) # Assign result
# or
df['A'].fillna(0, inplace=True) # Modify in place

Quick Reference

Detect missing:

df.isnull()                    # Boolean mask
df.isnull().sum() # Count per column
df.isnull().any() # Any missing per column
df[df['col'].isnull()] # Rows where col is null

Remove missing:

df.dropna()                    # Drop rows with any NaN
df.dropna(how='all') # Drop if all values NaN
df.dropna(subset=['col']) # Drop based on specific column
df.dropna(axis=1) # Drop columns with NaN
df.dropna(thresh=2) # Keep rows with 2+ non-null

Fill missing:

df.fillna(0)                   # Fill with constant
df.fillna(df.mean()) # Fill with mean
df.fillna(method='ffill') # Forward fill
df.fillna(method='bfill') # Backward fill
df.interpolate() # Interpolate
df.fillna({'A': 0, 'B': 999}) # Different values per column

Replace with NaN:

df.replace(-999, np.nan)       # Replace specific value
df.replace(['NA', '?'], np.nan) # Replace multiple values

Common workflow:

# 1. Check missing data
df.isnull().sum()

# 2. Decide strategy per column
df.dropna(subset=['critical_col'])
df['optional_col'].fillna(df['optional_col'].median())

# 3. Verify
assert df.isnull().sum().sum() == 0