## Practical Techniques

### Real-World CSV Data Processing Strategies

#### 1. Large File Handling
```python
import pandas as pd

# Process a large CSV file in memory-friendly chunks
chunk_size = 10000
first_chunk = True
for chunk in pd.read_csv('large_dataset.csv', chunksize=chunk_size):
    # Keep only the rows that match the filter condition
    processed_chunk = chunk[chunk['age'] > 25]
    # Append each chunk to the output; write the header only once
    processed_chunk.to_csv('filtered_data.csv', mode='a',
                           header=first_chunk, index=False)
    first_chunk = False
```
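The same chunked loop also works for aggregation when no filtered copy is needed on disk. A minimal sketch, assuming the same `large_dataset.csv` layout as above:

```python
import pandas as pd

# Aggregate across chunks without loading the whole file into memory
total_rows = 0
age_sum = 0
for chunk in pd.read_csv('large_dataset.csv', chunksize=10000):
    matches = chunk[chunk['age'] > 25]
    total_rows += len(matches)
    age_sum += matches['age'].sum()

if total_rows:
    print(f"{total_rows} matching rows, mean age {age_sum / total_rows:.1f}")
```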
#### 2. Data Validation Techniques
```python
import pandas as pd

def validate_csv_data(df):
    # Check that each column carries the expected pandas dtype
    expected_types = {
        'name': 'object',
        'age': 'int64',
        'salary': 'float64',
    }
    for column, dtype in expected_types.items():
        if not pd.api.types.is_dtype_equal(df[column].dtype, dtype):
            raise ValueError(f"Invalid data type for column {column}")

    # Additional validation rules
    if (df['age'] < 0).any():
        raise ValueError("Age cannot be negative")

    return df
```
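As a quick check that the validator behaves as intended, here is a toy frame (invented values) where the age rule fires:

```python
import pandas as pd

df = pd.DataFrame({
    'name': ['Alice', 'Bob'],
    'age': [34, -1],
    'salary': [72000.0, 65000.0],
})

try:
    validate_csv_data(df)
except ValueError as err:
    print(err)  # Age cannot be negative
```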
Validation is one stage in a larger processing flow:

```mermaid
graph TD
    A[Raw Data] --> B{Validation}
    B -->|Pass| C[Transform]
    B -->|Fail| D[Error Handling]
    C --> E[Export/Analyze]
```
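A minimal sketch of how the diagram's stages can map onto the functions in this section, using `validate_csv_data` from above and the `clean_data` helper defined in the next subsection (the error-handling branch is just a placeholder print here):

```python
import pandas as pd

def run_pipeline(input_file, output_file):
    df = pd.read_csv(input_file)              # A: Raw Data
    try:
        validated = validate_csv_data(df)     # B: Validation
    except ValueError as err:
        print(f"Validation failed: {err}")    # D: Error Handling
        return None
    cleaned = clean_data(validated)           # C: Transform
    cleaned.to_csv(output_file, index=False)  # E: Export/Analyze
    return cleaned
```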
#### 3. Complex Data Cleaning
```python
def clean_data(df):
    # Remove duplicate entries
    df = df.drop_duplicates()

    # Standardize text columns: trim whitespace, title-case names
    df['name'] = df['name'].str.strip().str.title()

    # Fill missing salaries with the column median
    df['salary'] = df['salary'].fillna(df['salary'].median())

    return df
```
### Common CSV Processing Scenarios

| Scenario | Technique | Example |
| --- | --- | --- |
| Deduplication | Remove duplicate rows | `df.drop_duplicates()` |
| Data Normalization | Standardize values | `df['column'].str.lower()` |
| Outlier Detection | Identify extreme values | `df[np.abs(df['age'] - df['age'].mean()) <= (3 * df['age'].std())]` |
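To see the outlier filter from the table in action, here is a short demonstration on synthetic data (the values are invented; three standard deviations is the conventional rule of thumb):

```python
import numpy as np
import pandas as pd

# Synthetic ages with one implausible value appended
rng = np.random.default_rng(0)
ages = np.append(rng.integers(22, 45, size=50), 500)
df = pd.DataFrame({'age': ages})

# Keep rows within three standard deviations of the mean
mask = np.abs(df['age'] - df['age'].mean()) <= (3 * df['age'].std())
print(df[~mask])  # the 500-year-old row is flagged as an outlier
```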
### Performance Optimization

Converting repeated strings to the categorical dtype and downcasting numeric columns can cut memory use substantially:

```python
import numpy as np
import pandas as pd

def optimize_csv_processing(df):
    # Convert low-cardinality string columns to categorical
    categorical_columns = ['city', 'department']
    for col in categorical_columns:
        df[col] = pd.Categorical(df[col])

    # Downcast numeric columns to the smallest sufficient types
    df['age'] = pd.to_numeric(df['age'], downcast='integer')
    df['salary'] = pd.to_numeric(df['salary'], downcast='float')

    return df
```
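A rough way to verify the effect, using made-up data sized so the difference is visible (the column names match the function's assumptions):

```python
import pandas as pd

# Made-up frame with repetitive strings and wide numeric dtypes
df = pd.DataFrame({
    'city': ['New York', 'San Francisco'] * 50000,
    'department': ['Sales', 'Engineering'] * 50000,
    'age': [25, 40] * 50000,
    'salary': [50000.0, 90000.0] * 50000,
})

before = df.memory_usage(deep=True).sum()
df = optimize_csv_processing(df)
after = df.memory_usage(deep=True).sum()
print(f"{before / 1e6:.1f} MB -> {after / 1e6:.1f} MB")
```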
### Advanced Data Processing

Filtering on several conditions and deriving new features often go together:

```python
def advanced_data_processing(df):
    # Complex filtering with multiple conditions; copy the result so the
    # assignment below does not write into a view of the original frame
    target_df = df[
        (df['age'].between(25, 40)) &
        (df['salary'] > df['salary'].median()) &
        (df['city'].isin(['New York', 'San Francisco']))
    ].copy()

    # Create a derived feature: bucket salaries into tiers
    target_df['salary_tier'] = pd.cut(
        target_df['salary'],
        bins=[0, 50000, 100000, float('inf')],
        labels=['Low', 'Medium', 'High']
    )
    return target_df
```
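Called on a toy frame (invented rows), only employees who satisfy all three conditions come back, each with a salary tier attached:

```python
import pandas as pd

df = pd.DataFrame({
    'age': [28, 45, 33, 30],
    'salary': [60000.0, 120000.0, 95000.0, 40000.0],
    'city': ['New York', 'Chicago', 'San Francisco', 'New York'],
})

result = advanced_data_processing(df)
print(result[['age', 'salary', 'city', 'salary_tier']])
# Only the 33-year-old in San Francisco clears every filter (tier: Medium)
```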
### Error Handling and Logging
```python
import logging

import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def process_csv_safely(input_file, output_file):
    try:
        df = pd.read_csv(input_file)
        processed_df = clean_data(df)
        processed_df.to_csv(output_file, index=False)
        logger.info(f"Successfully processed {input_file}")
    except Exception as e:
        logger.error(f"Error processing {input_file}: {str(e)}")
```
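Usage is then a single call with input and output paths (hypothetical file names); because errors are logged rather than raised, a batch job can keep going after a bad file:

```python
process_csv_safely('employees.csv', 'employees_clean.csv')
```

If a persistent record is needed, passing `filename='process.log'` to `logging.basicConfig` writes the same messages to a file instead of the console.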
### Best Practices

- Always validate input data before processing
- Use memory-efficient techniques such as chunking and dtype downcasting
- Handle errors gracefully so one bad file does not halt a batch
- Log each processing step so failures can be traced
At LabEx, we recommend mastering these practical techniques to become proficient in CSV data processing.