String Operations
Overview
pandas provides string methods through the .str accessor. These methods work on Series containing strings and are essential for text data cleaning.
Basic String Methods
Case Conversion
Change text case
import pandas as pd
df = pd.DataFrame({
'name': ['Alice Smith', 'BOB JONES', 'charlie Brown']
})
# Convert to lowercase
df['name'].str.lower()
# 0 alice smith
# 1 bob jones
# 2 charlie brown
# Convert to uppercase
df['name'].str.upper()
# 0 ALICE SMITH
# 1 BOB JONES
# 2 CHARLIE BROWN
# Title case (capitalize first letter of each word)
df['name'].str.title()
# 0 Alice Smith
# 1 Bob Jones
# 2 Charlie Brown
# Capitalize (only first letter)
df['name'].str.capitalize()
# 0 Alice smith
# 1 Bob jones
# 2 Charlie brown
Strip Whitespace
Remove leading/trailing whitespace
df = pd.DataFrame({
'text': [' hello ', 'world ', ' test']
})
# Remove both sides
df['text'].str.strip()
# 0 hello
# 1 world
# 2 test
# Remove left side only
df['text'].str.lstrip()
# 0 hello
# 1 world
# 2 test
# Remove right side only
df['text'].str.rstrip()
# 0 hello
# 1 world
# 2 test
Always strip whitespace when cleaning user input.
String Length
Get string length
df = pd.DataFrame({
'text': ['hello', 'world', 'test']
})
df['text'].str.len()
# 0 5
# 1 5
# 2 4
# Filter by length
df[df['text'].str.len() > 4]
# text
# 0 hello
# 1 world
Finding and Replacing
Contains
Check if string contains a pattern:
Check if strings contain pattern
df = pd.DataFrame({
'email': ['alice@gmail.com', 'bob@yahoo.com', 'charlie@gmail.com']
})
# Check if contains 'gmail'
df['email'].str.contains('gmail')
# 0 True
# 1 False
# 2 True
# Use in filtering
gmail_users = df[df['email'].str.contains('gmail')]
# email
# 0 alice@gmail.com
# 2 charlie@gmail.com
# Case-insensitive search
df['email'].str.contains('GMAIL', case=False)
# NaN-safe check (na=False makes missing values return False instead of NaN)
df['email'].str.contains('gmail', na=False)
Starts With / Ends With
Check string start or end
df = pd.DataFrame({
'filename': ['report.pdf', 'data.csv', 'image.png', 'notes.txt']
})
# Files ending with .pdf
df['filename'].str.endswith('.pdf')
# 0 True
# 1 False
# 2 False
# 3 False
# Files starting with 'data'
df['filename'].str.startswith('data')
# 0 False
# 1 True
# 2 False
# 3 False
# Filter PDF files
pdf_files = df[df['filename'].str.endswith('.pdf')]
Replace
Replace substrings
df = pd.DataFrame({
'phone': ['123-456-7890', '555-123-4567', '999-888-7777']
})
# Replace hyphens with spaces
df['phone'].str.replace('-', ' ')
# 0 123 456 7890
# 1 555 123 4567
# 2 999 888 7777
# Remove all hyphens
df['phone'].str.replace('-', '')
# 0 1234567890
# 1 5551234567
# 2 9998887777
# Regex replacement (remove all non-digits)
df['phone'].str.replace(r'\D', '', regex=True)
# 0 1234567890
# 1 5551234567
# 2 9998887777
# Multiple replacements
df['phone'].str.replace('-', '').str.replace(' ', '')
Splitting and Joining
Split Strings
Split strings into parts
df = pd.DataFrame({
'name': ['Alice Smith', 'Bob Jones', 'Charlie Brown']
})
# Split into list
df['name'].str.split()
# 0 [Alice, Smith]
# 1 [Bob, Jones]
# 2 [Charlie, Brown]
# Split into separate columns
df[['first_name', 'last_name']] = df['name'].str.split(expand=True)
# name first_name last_name
# 0 Alice Smith Alice Smith
# 1 Bob Jones Bob Jones
# 2 Charlie Brown Charlie Brown
# Split on specific delimiter
df = pd.DataFrame({
'email': ['alice@gmail.com', 'bob@yahoo.com']
})
df[['username', 'domain']] = df['email'].str.split('@', expand=True)
# email username domain
# 0 alice@gmail.com alice gmail.com
# 1 bob@yahoo.com bob yahoo.com
# Get specific part
df['email'].str.split('@').str[0] # Username
# 0 alice
# 1 bob
Join Strings
Combine strings
df = pd.DataFrame({
'first': ['Alice', 'Bob'],
'last': ['Smith', 'Jones']
})
# Concatenate columns
df['full_name'] = df['first'] + ' ' + df['last']
# first last full_name
# 0 Alice Smith Alice Smith
# 1 Bob Jones Bob Jones
# Join with separator
df['full_name'] = df['first'].str.cat(df['last'], sep=' ')
# Join multiple columns
df['full'] = df['first'].str.cat([df['last']], sep=' ')
Extracting Substrings
Slicing
Extract parts of string
df = pd.DataFrame({
'code': ['ABC123', 'DEF456', 'GHI789']
})
# First 3 characters
df['code'].str[:3]
# 0 ABC
# 1 DEF
# 2 GHI
# Last 3 characters
df['code'].str[-3:]
# 0 123
# 1 456
# 2 789
# Characters 2-4
df['code'].str[2:4]
# 0 C1
# 1 F4
# 2 I7
Extract with Regex
Extract patterns with regex
df = pd.DataFrame({
'text': ['Price: $100', 'Cost: $250', 'Value: $75']
})
# Extract numbers
df['text'].str.extract(r'(\d+)')
# 0
# 0 100
# 1 250
# 2 75
# Extract with named groups
df['text'].str.extract(r'(?P<label>\w+): \$(?P<amount>\d+)')
# label amount
# 0 Price 100
# 1 Cost 250
# 2 Value 75
# Extract all occurrences
df = pd.DataFrame({
'text': ['ID: 123, 456', 'ID: 789']
})
df['text'].str.extractall(r'(\d+)')
# 0
# match
# 0 0 123
# 1 456
# 1 0 789
Pattern Matching
Match
Check if pattern matches from start
df = pd.DataFrame({
'code': ['ABC123', 'XYZ456', 'ABC789']
})
# Match codes starting with ABC (match anchors at the start, so ^ is optional)
df['code'].str.match(r'^ABC')
# 0 True
# 1 False
# 2 True
# Use in filtering
abc_codes = df[df['code'].str.match(r'^ABC')]
# code
# 0 ABC123
# 2 ABC789
Find Position
Find substring position
df = pd.DataFrame({
'text': ['hello world', 'test hello', 'hello']
})
# Find position of 'hello'
df['text'].str.find('hello')
# 0 0 # Found at position 0
# 1 5 # Found at position 5
# 2 0 # Found at position 0
# Returns -1 if not found
df['text'].str.find('xyz')
# 0 -1
# 1 -1
# 2 -1
Cleaning Operations
Remove Special Characters
Clean special characters
df = pd.DataFrame({
'text': ['hello!', 'world?', 'test@123']
})
# Remove punctuation
df['text'].str.replace(r'[^\w\s]', '', regex=True)
# 0 hello
# 1 world
# 2 test123
# Keep only letters
df['text'].str.replace(r'[^a-zA-Z]', '', regex=True)
# 0 hello
# 1 world
# 2 test
# Keep only digits
df['text'].str.replace(r'\D', '', regex=True)
# 0
# 1
# 2 123
Normalize Text
Standardize text format
df = pd.DataFrame({
'name': [' ALICE ', 'bob', 'Charlie ']
})
# Clean and standardize
df['name_clean'] = (df['name']
.str.strip() # Remove whitespace
.str.lower() # Lowercase (optional here - .title() overrides case anyway)
.str.title() # Title case
)
# name name_clean
# 0 ALICE Alice
# 1 bob Bob
# 2 Charlie Charlie
Remove Extra Spaces
Remove multiple spaces
df = pd.DataFrame({
'text': ['hello world', 'test data', 'a b c']
})
# Replace multiple spaces with single space
df['text'].str.replace(r'\s+', ' ', regex=True)
# 0 hello world
# 1 test data
# 2 a b c
Padding and Alignment
Pad Strings
Add padding to strings
df = pd.DataFrame({
'code': ['1', '42', '999']
})
# Pad with zeros (left)
df['code'].str.zfill(5)
# 0 00001
# 1 00042
# 2 00999
# Pad left with any character
df['code'].str.pad(5, side='left', fillchar='0')
# 0 00001
# 1 00042
# 2 00999
# Pad right
df['code'].str.pad(5, side='right', fillchar='X')
# 0 1XXXX
# 1 42XXX
# 2 999XX
# Center
df['code'].str.center(5, fillchar='-')
# 0 --1--
# 1 -42--
# 2 -999-
Working with Dates in Strings
Parse Date Components
Extract date parts from strings
df = pd.DataFrame({
'date_str': ['2024-01-15', '2024-02-20', '2024-03-10']
})
# Extract year, month, day
df['year'] = df['date_str'].str[:4]
df['month'] = df['date_str'].str[5:7]
df['day'] = df['date_str'].str[8:10]
# Or split
df[['year', 'month', 'day']] = df['date_str'].str.split('-', expand=True)
# date_str year month day
# 0 2024-01-15 2024 01 15
# 1 2024-02-20 2024 02 20
# 2 2024-03-10 2024 03 10
# Better: convert to datetime
df['date'] = pd.to_datetime(df['date_str'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
Handling Missing Values
String Operations with NaN
Handle NaN in string operations
import numpy as np
df = pd.DataFrame({
'text': ['hello', None, 'world', np.nan]
})
# String methods propagate missing values as NaN by default
df['text'].str.upper()
# 0 HELLO
# 1 NaN
# 2 WORLD
# 3 NaN
# Fill NaN before operations
df['text'].fillna('').str.upper()
# 0 HELLO
# 1
# 2 WORLD
# 3
# Check for NaN-safe operations
df['text'].str.contains('hello', na=False)
# 0 True
# 1 False
# 2 False
# 3 False
Common Cleaning Patterns
Email Cleaning
Standardize email addresses
df = pd.DataFrame({
'email': [' ALICE@Gmail.COM ', 'bob@YAHOO.com', 'Charlie@Test.Com']
})
df['email_clean'] = (df['email']
.str.strip() # Remove whitespace
.str.lower() # Lowercase
)
# email email_clean
# 0 ALICE@Gmail.COM alice@gmail.com
# 1 bob@YAHOO.com bob@yahoo.com
# 2 Charlie@Test.Com charlie@test.com
Phone Number Cleaning
Standardize phone numbers
df = pd.DataFrame({
'phone': ['(123) 456-7890', '555.123.4567', '999-888-7777']
})
# Remove all non-digits
df['phone_clean'] = df['phone'].str.replace(r'\D', '', regex=True)
# phone phone_clean
# 0 (123) 456-7890 1234567890
# 1 555.123.4567 5551234567
# 2 999-888-7777 9998887777
# Format consistently
df['phone_formatted'] = df['phone_clean'].str.replace(
r'(\d{3})(\d{3})(\d{4})',
r'\1-\2-\3',
regex=True
)
# phone phone_clean phone_formatted
# 0 (123) 456-7890 1234567890 123-456-7890
# 1 555.123.4567 5551234567 555-123-4567
# 2 999-888-7777 9998887777 999-888-7777
Name Cleaning
Clean and standardize names
df = pd.DataFrame({
'name': [' alice smith ', 'BOB JONES', 'charlie-brown']
})
df['name_clean'] = (df['name']
.str.strip() # Remove whitespace
.str.replace('-', ' ') # Replace hyphens
.str.replace(r'\s+', ' ', regex=True) # Remove extra spaces
.str.title() # Title case
)
# name name_clean
# 0 alice smith Alice Smith
# 1 BOB JONES Bob Jones
# 2 charlie-brown Charlie Brown
URL/Domain Extraction
Extract domain from URLs
df = pd.DataFrame({
'url': ['https://www.example.com/page', 'http://test.org', 'www.site.com']
})
# Extract domain
df['domain'] = (df['url']
.str.replace(r'https?://', '', regex=True) # Remove protocol
.str.replace(r'^www\.', '', regex=True) # Remove leading www. (escape the dot; a bare 'www.' is a regex in older pandas)
.str.split('/').str[0] # Get domain only
)
# url domain
# 0 https://www.example.com/page example.com
# 1 http://test.org test.org
# 2 www.site.com site.com
Performance Tips
Vectorized Operations
Efficient string operations
# Slow: iterating
result = []
for val in df['text']:
result.append(val.upper() if val else '')
df['upper'] = result
# Fast: vectorized
df['upper'] = df['text'].str.upper()
# Chain operations efficiently
df['clean'] = (df['text']
.str.strip()
.str.lower()
.str.replace(r'\s+', ' ', regex=True)
)
Quick Reference
Case conversion:
df['col'].str.lower() # Lowercase
df['col'].str.upper() # Uppercase
df['col'].str.title() # Title case
df['col'].str.capitalize() # Capitalize first
Whitespace:
df['col'].str.strip() # Remove both sides
df['col'].str.lstrip() # Remove left
df['col'].str.rstrip() # Remove right
Find/replace:
df['col'].str.contains('text') # Check contains
df['col'].str.startswith('text') # Check starts
df['col'].str.endswith('text') # Check ends
df['col'].str.replace('old', 'new') # Replace
Split/join:
df['col'].str.split() # Split to list
df['col'].str.split(expand=True) # Split to columns
df['col1'].str.cat(df['col2'], sep=' ') # Join
Extract:
df['col'].str[:3] # First 3 chars
df['col'].str[-3:] # Last 3 chars
df['col'].str.extract(r'(\d+)') # Extract pattern
Clean:
df['col'].str.replace(r'[^\w\s]', '', regex=True) # Remove special chars
df['col'].str.replace(r'\s+', ' ', regex=True) # Remove extra spaces
df['col'].str.zfill(5) # Pad with zeros
Common pattern:
# Clean text column
df['col_clean'] = (df['col']
.str.strip() # Remove whitespace
.str.lower() # Lowercase
.str.replace(r'\s+', ' ', regex=True) # Remove extra spaces
)