Technology  /  Apache Spark

Apache Spark 49 guides · updated 2026

Distributed data processing at scale — RDDs, DataFrames, Structured Streaming, and the tuning techniques that keep Spark jobs fast and cheap.

Apache Spark DataFrames

A DataFrame is Spark’s primary abstraction for structured data — a distributed table with named columns and a schema. Unlike RDDs, DataFrames are optimized by the Catalyst query optimizer, which generates efficient execution plans regardless of whether you use Python, Scala, or SQL.


Creating DataFrames

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
# Entry point for the DataFrame API: build the session, or fetch the existing one.
spark = (
    SparkSession.builder
    .appName("DataFrame Demo")
    .getOrCreate()
)

# --- From in-memory Python tuples ---
data = [
    ("Alice", "Engineering", 95000),
    ("Bob", "Marketing", 72000),
    ("Carol", "Engineering", 110000),
    ("Dave", "HR", 65000),
]
# Only column names are given here, so the column types are inferred from the data.
df = spark.createDataFrame(data, schema=["name", "department", "salary"])

# --- Same rows, with types and nullability spelled out ---
schema = StructType(
    [
        StructField("name", StringType(), nullable=False),
        StructField("department", StringType(), nullable=True),
        StructField("salary", IntegerType(), nullable=True),
    ]
)
df = spark.createDataFrame(data, schema=schema)

# --- From external storage ---
# CSV: first line is the header row; column types are inferred from the file.
df_csv = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("data/employees.csv")
)
# JSON: the glob pattern selects every matching file.
df_json = spark.read.json("data/events/*.json")
# Parquet: the schema is stored in the files themselves.
df_parquet = spark.read.parquet("s3://bucket/transactions/")
# Delta Lake table, selected through the generic format/load reader.
df_delta = spark.read.format("delta").load("s3://bucket/delta-table/")

Schema and Inspection

# Print the schema as a tree: one line per column with its type and nullability.
df.printSchema()
# root
# |-- name: string (nullable = false)
# |-- department: string (nullable = true)
# |-- salary: integer (nullable = true)

# Preview the first rows; truncate=False prints cell values in full.
df.show(n=5)
df.show(n=5, truncate=False)

# Schema metadata as plain Python lists
df.columns  # ['name', 'department', 'salary']
df.dtypes  # [('name', 'string'), ('department', 'string'), ('salary', 'int')]

# Row count and summary statistics
df.count()  # 4
df.describe().show()  # Summary stats

Transformations

from pyspark.sql import functions as F
# Column expressions reused by several of the examples below
name_col = F.col("name")
salary_col = F.col("salary")
highest_paid_first = salary_col.desc()

# --- Projection: by column name, or with Column expressions ---
df.select("name", "salary")
df.select(name_col, salary_col * 1.1)

# --- Row filtering: attribute access, Column expression, or a SQL string ---
df.filter(df.salary > 80000)
df.filter(F.col("department") == "Engineering")
df.where("salary BETWEEN 70000 AND 120000")

# --- Adding and renaming columns ---
df.withColumn("salary_bonus", salary_col * 0.1)
df.withColumnRenamed("salary", "annual_salary")

# --- Removing columns ---
df.drop("department")

# --- Ordering: orderBy and sort both accept names and Column expressions ---
df.orderBy(highest_paid_first)
df.sort("department", highest_paid_first)

# --- Keep only the first 10 rows ---
df.limit(10)

Aggregations and Grouping

# Per-department summary: one output row per distinct department value.
by_department = df.groupBy("department")
dept_stats = by_department.agg(
    F.count("name").alias("headcount"),
    F.avg("salary").alias("avg_salary"),
    F.max("salary").alias("max_salary"),
    F.sum("salary").alias("total_salary"),
)
dept_stats.show()
# +-----------+---------+----------+----------+------------+
# |department |headcount|avg_salary|max_salary|total_salary|
# +-----------+---------+----------+----------+------------+
# |Engineering|2        |102500.0  |110000    |205000      |
# |Marketing  |1        |72000.0   |72000     |72000       |
# |HR         |1        |65000.0   |65000     |65000       |
# +-----------+---------+----------+----------+------------+

# Window functions: rank employees inside each department, highest salary first.
from pyspark.sql.window import Window

window = (
    Window.partitionBy("department")
    .orderBy(F.col("salary").desc())
)
ranked = df.withColumn("rank_in_dept", F.rank().over(window))
ranked.show()

SQL Queries

# Register the DataFrame as a temporary view named "employees" so SQL can refer to it.
# createOrReplaceTempView replaces any temporary view that already has this name.
df.createOrReplaceTempView("employees")
# Run SQL against the view: keep salaries above 60000, then report headcount and
# rounded average salary per department, highest average first.
result = spark.sql("""
SELECT
department,
COUNT(*) as headcount,
ROUND(AVG(salary), 0) as avg_salary
FROM employees
WHERE salary > 60000
GROUP BY department
ORDER BY avg_salary DESC
""")
# Print the query result as a table.
result.show()

Joins

# Lookup table: one row per department with its office location.
departments = spark.createDataFrame(
    [
        ("Engineering", "Building A"),
        ("Marketing", "Building B"),
        ("HR", "Building C"),
    ],
    ["dept_name", "location"],
)

# The key column is named differently on each side, so the join condition
# is written as an explicit Column expression.
same_department = df.department == departments.dept_name

# Inner join: only employees whose department appears in the lookup table.
df.join(departments, on=same_department, how="inner")

# Left join: every employee, with null location columns when there is no match.
df.join(departments, on=same_department, how="left")

# Broadcast join (small table — avoids shuffle)
from pyspark.sql.functions import broadcast

df.join(broadcast(departments), on=same_department)

Writing Data

# Write to various formats.
# mode("overwrite") replaces existing output at the path; mode("append") adds to it.
df.write.mode("overwrite").parquet("s3://bucket/output/")
# hdfs:/// (empty authority) targets the cluster's default NameNode. Written as
# "hdfs://output/json/", the word "output" is parsed as the NameNode host.
df.write.mode("append").json("hdfs:///output/json/")
# No mode is set here, so the default applies and the write fails if the path exists.
# The path names a directory of part files, not a single CSV file.
df.write.option("header", True).csv("output/employees.csv")
# Partition by column (improves read performance when queries filter on it)
df.write.partitionBy("department").mode("overwrite").parquet("s3://bucket/partitioned/")
# Delta Lake (2025 standard for lakehouse)
df.write.format("delta").mode("overwrite").save("s3://bucket/delta/employees/")

Performance Tips

from pyspark import StorageLevel

# Cache a frequently reused DataFrame (cache() always uses the default storage level)
df.cache()
# Unpersist when done
df.unpersist()
# persist() takes an explicit storage level. Call it on a DataFrame that is not
# already cached: persisting cached data again does not change its level.
df.persist(StorageLevel.MEMORY_AND_DISK)
df.unpersist()
# Repartition for parallelism (each call returns a new DataFrame)
df.repartition(200)  # Shuffle — even distribution
df.coalesce(4)  # No shuffle — reduce partitions
# Avoid UDFs when built-in functions exist
# BAD: df.withColumn("upper_name", F.udf(lambda x: x.upper())(F.col("name")))
# GOOD:
df.withColumn("upper_name", F.upper(F.col("name")))