Master data manipulation with Polars, the blazing-fast DataFrame library.
Polars is a DataFrame library for Python that’s designed for speed and efficiency.
- Fast ⚡
- Memory Efficient 💾
- Expressive API 🎯
- Type-Safe ✅
| Feature | Pandas | Polars |
|---|---|---|
| Speed | Good | Excellent (often 5-10x faster) |
| Memory | Higher | Lower |
| Syntax | Method chaining | Method chaining + expressions |
| Parallel | Limited | Full parallelism |
| Type safety | Weak | Strong |
| Learning curve | Gentle | Moderate |
For this course: We use Polars because it’s the modern standard and teaches good data habits!
import polars as pl
# Basic read
df = pl.read_csv("data/raw/students.csv")
# With options
df = pl.read_csv(
    "data/raw/students.csv",
    null_values=["NA", ""],                     # Treat these as null
    schema_overrides={"student_id": pl.Int64},  # Specify column types ("dtypes" before Polars 1.0)
    n_rows=100,                                 # Read only first 100 rows
)
# JSON array of objects
df = pl.read_json("data/raw/sales.json")
# Newline-delimited JSON
df = pl.read_ndjson("data/raw/logs.ndjson")
# Parquet (efficient binary format)
df = pl.read_parquet("data/raw/weather.parquet")
df = pl.read_csv("https://example.com/data.csv")
import polars as pl
df = pl.read_csv("data/raw/students.csv")
# View first rows
df.head() # First 5 rows
df.head(10) # First 10 rows
# View last rows
df.tail() # Last 5 rows
# Get shape
rows, cols = df.shape
print(f"Rows: {rows}, Columns: {cols}")
# Get column names
df.columns # ['student_id', 'name', 'age', ...]
# Get data types
df.dtypes # List of column dtypes
df.schema # Mapping of column name → type
# Summary statistics
df.describe() # Count, mean, std, min, max, etc.
# Basic info
print(df) # Shows first few rows
# Detailed schema
df.schema
# {'student_id': Int64, 'name': String, 'age': Int64, ...}
# Sample random rows
df.sample(n=5) # 5 random rows
df.sample(fraction=0.1) # 10% of data
# Check for nulls
df.null_count() # Nulls per column
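To see these inspection calls in action without a file, a tiny made-up frame works fine:

```python
import polars as pl

# Invented data, just so the calls below run anywhere
df = pl.DataFrame({
    "name": ["Ana", "Ben", "Cy"],
    "age": [15, 16, None],
})

print(df.shape)        # (3, 2)
print(df.columns)      # ['name', 'age']
print(df.schema)       # name → String, age → Int64
print(df.null_count()) # One null in 'age'
```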
# Single column (returns DataFrame)
df.select("name")
# Multiple columns
df.select("name", "age", "grade_level")
# Using a list
columns = ["name", "age"]
df.select(columns)
# Using pl.col() expression
df.select(
    pl.col("name"),
    pl.col("test_score")
)
# All columns starting with 'test'
df.select(pl.col("^test.*$"))
# All numeric columns
df.select(pl.col(pl.Int64, pl.Float64))
# Everything except one column
df.select(pl.exclude("student_id"))
# Rename while selecting
df.select(
    pl.col("name").alias("student_name"),
    pl.col("test_score").alias("score")
)
# Rename columns (returns a new DataFrame; nothing is modified in place)
df.rename({"name": "student_name", "age": "student_age"})
# Single condition
df.filter(pl.col("age") > 15)
# Multiple conditions with &
df.filter(
    (pl.col("age") > 15) &
    (pl.col("test_score") > 80)
)
# Multiple conditions with |
df.filter(
    (pl.col("grade_level") == 10) |
    (pl.col("grade_level") == 11)
)
# Greater than
df.filter(pl.col("test_score") > 75)
# Less than or equal
df.filter(pl.col("age") <= 16)
# Equal to
df.filter(pl.col("subject") == "Mathematics")
# Not equal
df.filter(pl.col("subject") != "Art")
# In a list
df.filter(pl.col("grade_level").is_in([10, 11, 12]))
# Null values
df.filter(pl.col("test_score").is_null())
df.filter(pl.col("test_score").is_not_null())
# String operations
df.filter(pl.col("name").str.contains("John"))
df.filter(pl.col("name").str.starts_with("A"))
# Between values (bounds are inclusive by default)
df.filter(
    pl.col("test_score").is_between(70, 90)
)
# Not in list
df.filter(
    ~pl.col("subject").is_in(["Art", "Music"])
)
# Multiple AND conditions
df.filter(
    (pl.col("age") > 14) &
    (pl.col("attendance_rate") > 90) &
    (pl.col("test_score") > 85)
)
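One subtlety worth seeing on concrete values: a comparison against null evaluates to null, and `filter` treats null as "no". A small made-up frame makes that visible:

```python
import polars as pl

df = pl.DataFrame({
    "name": ["Ana", "Ben", "Cy", "Dee"],
    "age": [14, 16, 17, 15],
    "test_score": [70, 88, None, 95],
})

# Each comparison needs its own parentheses. Cy is dropped because
# null > 80 is null, which filter does not count as a match.
print(df.filter((pl.col("age") > 14) & (pl.col("test_score") > 80)))
```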
# Sort by one column (ascending)
df.sort("test_score")
# Sort descending
df.sort("test_score", descending=True)
# Sort by multiple columns
df.sort(["grade_level", "test_score"])
# Mixed sorting
df.sort(
    "grade_level",             # Ascending
    "test_score",              # Descending, per the flag below
    descending=[False, True],  # One flag per sort column
)
# Sort by calculated value
df.sort(pl.col("test_score") / pl.col("attendance_rate"))
# Sort nulls first
df.sort("test_score", nulls_last=False)
# Add new column
df.with_columns(
    pl.col("test_score").alias("score_copy")
)
# Calculate new column
df.with_columns(
    (pl.col("test_score") * 1.1).alias("bonus_score")
)
# Multiple new columns
df.with_columns(
    pl.col("test_score").round(0).alias("rounded_score"),
    (pl.col("age") + 1).alias("next_year_age"),
    pl.lit("2024").alias("year")  # Literal value
)
# If-else logic with when/then/otherwise
df.with_columns(
    pl.when(pl.col("test_score") >= 90)
    .then(pl.lit("A"))
    .when(pl.col("test_score") >= 80)
    .then(pl.lit("B"))
    .when(pl.col("test_score") >= 70)
    .then(pl.lit("C"))
    .otherwise(pl.lit("F"))
    .alias("grade")
)
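Branches in a `when/then` chain are checked top to bottom and the first match wins, which is why the highest threshold must come first. A runnable sketch with made-up scores:

```python
import polars as pl

df = pl.DataFrame({"test_score": [95, 83, 71, 60]})

# 95 satisfies every condition, but only the first branch fires, so it gets "A"
print(
    df.with_columns(
        pl.when(pl.col("test_score") >= 90).then(pl.lit("A"))
        .when(pl.col("test_score") >= 80).then(pl.lit("B"))
        .when(pl.col("test_score") >= 70).then(pl.lit("C"))
        .otherwise(pl.lit("F"))
        .alias("grade")
    )
)
```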
# Uppercase
df.with_columns(
    pl.col("name").str.to_uppercase().alias("name_upper")
)
# Extract parts
df.with_columns(
    pl.col("name").str.split(" ").list.get(0).alias("first_name")
)
# Replace
df.with_columns(
    pl.col("subject").str.replace("Math", "Mathematics").alias("subject_full")
)
# Count rows per group (count() is deprecated in recent Polars)
df.group_by("grade_level").len()
# Single aggregation
df.group_by("subject").agg(
    pl.mean("test_score")
)
# Multiple aggregations
df.group_by("grade_level").agg(
    pl.mean("test_score").alias("avg_score"),
    pl.max("test_score").alias("max_score"),
    pl.len().alias("num_students")
)
df.group_by("subject").agg(
    pl.len().alias("count"),                  # Count rows
    pl.sum("test_score").alias("total"),      # Sum
    pl.mean("test_score").alias("average"),   # Mean
    pl.median("test_score").alias("median"),  # Median
    pl.std("test_score").alias("std_dev"),    # Standard deviation
    pl.min("test_score").alias("minimum"),    # Minimum
    pl.max("test_score").alias("maximum"),    # Maximum
    pl.first("test_score").alias("first"),    # First value
    pl.last("test_score").alias("last")       # Last value
)
df.group_by(["grade_level", "subject"]).agg(
    pl.mean("test_score").alias("avg_score"),
    pl.len().alias("num_students")
)
# Group, aggregate, then filter
(
    df.group_by("subject")
    .agg(pl.mean("test_score").alias("avg_score"))
    .filter(pl.col("avg_score") > 80)
)
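The aggregate-then-filter pattern is worth trying on concrete numbers (invented here): the filter sees the new `avg_score` column, not the raw rows:

```python
import polars as pl

df = pl.DataFrame({
    "subject": ["Math", "Math", "Art", "Art"],
    "test_score": [90.0, 80.0, 60.0, 100.0],
})

# Math averages 85 and survives; Art averages 80 and is filtered out
print(
    df.group_by("subject")
    .agg(pl.mean("test_score").alias("avg_score"))
    .filter(pl.col("avg_score") > 80)
)
```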
students = pl.read_csv("data/raw/students.csv")
grades = pl.DataFrame({
    "student_id": [1, 2, 3, 4, 5],
    "semester": ["Fall", "Fall", "Spring", "Fall", "Spring"],
    "gpa": [3.5, 3.8, 3.2, 3.9, 3.6]
})
# Keep only matching rows from both
students.join(grades, on="student_id", how="inner")
# Keep all rows from left, add matching from right
students.join(grades, on="student_id", how="left")
# Inner: Only matching rows
students.join(grades, on="student_id", how="inner")
# Left: All from left, matching from right
students.join(grades, on="student_id", how="left")
# Full outer: All rows from both
students.join(grades, on="student_id", how="full")  # Called "outer" before Polars 1.0
# Cross: Cartesian product (every combination)
students.join(grades, how="cross")
# When column names differ
students.join(
    grades,
    left_on="student_id",
    right_on="id",  # Hypothetical: assumes the right table calls its key "id"
    how="left"
)
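To see how the join types differ, here is a self-contained sketch with one student who has no grade record:

```python
import polars as pl

students = pl.DataFrame({"student_id": [1, 2, 3], "name": ["Ana", "Ben", "Cy"]})
grades = pl.DataFrame({"student_id": [1, 2], "gpa": [3.5, 3.8]})

# Inner keeps the 2 matching rows; left keeps all 3 students,
# with a null gpa for Cy, who has no match in grades
print(students.join(grades, on="student_id", how="inner"))
print(students.join(grades, on="student_id", how="left"))
```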
# Count nulls per column
df.null_count()
# Filter rows with nulls
df.filter(pl.col("test_score").is_null())
# Filter rows without nulls
df.filter(pl.col("test_score").is_not_null())
# Drop rows with any null
df.drop_nulls()
# Drop rows with null in specific column
df.drop_nulls(subset=["test_score"])
# Drop rows with null in multiple columns
df.drop_nulls(subset=["test_score", "attendance_rate"])
# Fill with a value
df.with_columns(
    pl.col("test_score").fill_null(0)  # Replaces the column in the result
)
# Fill with mean
mean_score = df.select(pl.mean("test_score")).item()
df.with_columns(
    pl.col("test_score").fill_null(mean_score)
)
# Forward fill (use previous value)
df.with_columns(
    pl.col("test_score").fill_null(strategy="forward")
)
# Backward fill (use next value)
df.with_columns(
    pl.col("test_score").fill_null(strategy="backward")
)
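The fill strategies behave differently at the edges: a leading null has no previous value to copy forward. A small demonstration on made-up scores, comparing all three side by side:

```python
import polars as pl

df = pl.DataFrame({"test_score": [None, 80, None, 90]})

# The leading null stays null under forward fill (nothing comes before it),
# while backward fill borrows the 80 that follows
print(
    df.with_columns(
        pl.col("test_score").fill_null(0).alias("zero_fill"),
        pl.col("test_score").fill_null(strategy="forward").alias("ffill"),
        pl.col("test_score").fill_null(strategy="backward").alias("bfill"),
    )
)
```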
Polars expressions are powerful and composable!
# Reference a column
pl.col("age")
# Multiple columns
pl.col("age", "test_score")
# All columns
pl.all()
# Columns matching pattern
pl.col("^test_.*$") # Regex pattern
# Chain multiple operations
df.select(
    pl.col("test_score")
    .fill_null(0)          # Fill nulls
    .clip(0, 100)          # Clip to range
    .round(1)              # Round to 1 decimal
    .alias("clean_score")  # Rename
)
# Rank within groups
df.with_columns(
    pl.col("test_score")
    .rank()
    .over("grade_level")
    .alias("rank_in_grade")
)
# Running sum
df.with_columns(
    pl.col("test_score")
    .cum_sum()
    .alias("cumulative_score")
)
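A runnable sketch of `over()` on invented data. Here a `"dense"` rank with `descending=True` is used instead of the default so the best score in each grade gets rank 1; the key point is that the ranking restarts within each group:

```python
import polars as pl

df = pl.DataFrame({
    "grade_level": [10, 10, 11, 11],
    "test_score": [80, 95, 70, 90],
})

# over() restarts the ranking inside each grade_level,
# so both grades get their own 1 and 2
print(
    df.with_columns(
        pl.col("test_score")
        .rank("dense", descending=True)
        .over("grade_level")
        .alias("rank_in_grade")
    )
)
```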
Polars is designed for readable method chains!
result = (
    pl.read_csv("data/raw/students.csv")
    .filter(pl.col("age") > 14)  # Keep students older than 14
    .with_columns(
        (pl.col("test_score") * 1.1).alias("bonus_score")  # Add bonus
    )
    .group_by("grade_level")  # Group by grade
    .agg(
        pl.mean("bonus_score").alias("avg_bonus"),
        pl.len().alias("num_students")
    )
    .sort("avg_bonus", descending=True)  # Sort by average
    .head(5)  # Top 5
)
For large datasets, use lazy mode for better performance!
# Start lazy
lazy_df = pl.scan_csv("large_file.csv")
# Build query (doesn't execute yet)
lazy_query = (
    lazy_df
    .filter(pl.col("age") > 18)
    .group_by("category")
    .agg(pl.sum("value"))
)
# Execute when ready
result = lazy_query.collect()
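You don't need a large CSV to experiment with lazy mode: any DataFrame can enter it via `.lazy()`, and `explain()` shows the optimized plan before any work happens. The data below is invented for the sketch:

```python
import polars as pl

df = pl.DataFrame({
    "category": ["a", "b", "a"],
    "value": [1, 2, 3],
    "age": [20, 17, 30],
})

lazy_query = (
    df.lazy()  # Any DataFrame can enter lazy mode too
    .filter(pl.col("age") > 18)
    .group_by("category")
    .agg(pl.sum("value"))
)
print(lazy_query.explain())  # The optimized plan, before anything runs
print(lazy_query.collect())  # Now it actually executes
```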
import polars as pl
# Reading
df = pl.read_csv("file.csv")
df = pl.read_json("file.json")
df = pl.read_parquet("file.parquet")
# Viewing
df.head()
df.tail()
df.describe()
df.schema
# Selecting
df.select("col1", "col2")
df.select(pl.col("col1"))
# Filtering
df.filter(pl.col("age") > 18)
df.filter((pl.col("a") > 5) & (pl.col("b") < 10))
# Sorting
df.sort("col1")
df.sort("col1", descending=True)
# Adding columns
df.with_columns(
    (pl.col("a") * 2).alias("a_doubled")
)
# Grouping
df.group_by("category").agg(pl.mean("value"))
# Joining
df1.join(df2, on="id", how="left")
# Writing
df.write_csv("output.csv")
df.write_json("output.json")
df.write_parquet("output.parquet")
| Operation | Pandas | Polars |
|---|---|---|
| Read CSV | `pd.read_csv()` | `pl.read_csv()` |
| Filter | `df[df['age'] > 18]` | `df.filter(pl.col("age") > 18)` |
| Add column | `df['new'] = df['a'] * 2` | `df.with_columns((pl.col("a") * 2).alias("new"))` |
| Group by | `df.groupby('cat')['val'].mean()` | `df.group_by("cat").agg(pl.mean("val"))` |
| Sort | `df.sort_values('col')` | `df.sort("col")` |
Try this on the students dataset:
import polars as pl
# Load data
df = pl.read_csv("data/raw/students.csv")
# Challenge: Find the average test score by grade level,
# but only for students with > 85% attendance
result = (
    df
    # Your code here!
)
Solution:
result = (
    df
    .filter(pl.col("attendance_rate") > 85)
    .group_by("grade_level")
    .agg(pl.mean("test_score").alias("avg_score"))
    .sort("grade_level")
)
See also: `example_notebooks/02_data_wrangling.py` and `exercises/ex02_wrangle_and_plot.py`.
Polars makes data wrangling fast and elegant. Happy analyzing! 🐻❄️