Creating Spark DataFrames
Spark DataFrames can be created from many sources. Choosing the right creation method and providing an explicit schema avoids schema inference overhead and runtime type errors.
From a Python List
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Session entry point used by every example below; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = SparkSession.builder.appName("DataFrame Creation").getOrCreate()
# From list of tuples — column names are given explicitly, column types are inferred from the data
data = [("Alice", "Engineering", 95000), ("Bob", "Marketing", 72000)]
df1 = spark.createDataFrame(data, ["name", "department", "salary"])
# With explicit schema — recommended for production
# (reuses the `data` list of (name, department, salary) tuples from the previous example)
schema = StructType([
    StructField("name", StringType(), nullable=False),
    StructField("department", StringType(), nullable=True),
    StructField("salary", IntegerType(), nullable=True),
])
df2 = spark.createDataFrame(data, schema)
# From list of dicts
records = [
    {"name": "Alice", "score": 92.5, "passed": True},
    {"name": "Bob", "score": 78.3, "passed": True},
]
df3 = spark.createDataFrame(records)
From Pandas DataFrame
import pandas as pd

# Small sample frame: each keyword becomes a column, each list position a row.
pdf = pd.DataFrame(
    dict(
        name=["Alice", "Bob", "Carol"],
        salary=[95000, 72000, 110000],
        active=[True, False, True],
    )
)
# Convert pandas → Spark (Arrow enabled for speed)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
df = spark.createDataFrame(pdf)
df.printSchema()
From CSV
# Basic — column names come from the header row, column types are inferred from the data
df = spark.read.csv("employees.csv", header=True, inferSchema=True)
# Production — explicit schema, no inference
schema = StructType([
    StructField("id", IntegerType()),
    StructField("name", StringType()),
    StructField("salary", DoubleType()),
])
df = spark.read \
    .schema(schema) \
    .option("header", "true") \
    .option("nullValue", "N/A") \
    .option("mode", "DROPMALFORMED") \
    .csv("s3://bucket/employees/*.csv")
From JSON
# Newline-delimited JSON (one JSON object per line — default)
df = spark.read.json("s3://bucket/events/*.json")
# Multi-line JSON (one array or object spanning multiple lines)
df = spark.read.option("multiLine", "true").json("data/config.json")
# With schema
df = spark.read.schema(schema).json("events.json")
From Parquet
# Parquet embeds schema — no inference needed
df = spark.read.parquet("s3://bucket/transactions/")
# With filters pushed into file reads
df = spark.read \
    .parquet("s3://bucket/sales/") \
    .filter("year = 2025 AND region = 'APAC'")
From a Database (JDBC)
df = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://host:5432/db") \
    .option("dbtable", "public.orders") \
    .option("user", "spark") \
    .option("password", "secret") \
    .option("numPartitions", "20") \
    .option("partitionColumn", "order_id") \
    .option("lowerBound", "1") \
    .option("upperBound", "10000000") \
    .load()
From an RDD
# Build an RDD of (name, salary) tuples through the session's SparkContext.
rdd = spark.sparkContext.parallelize(
    [
        ("Alice", 95000),
        ("Bob", 72000),
    ]
)
# Convert RDD to DataFrame
df = rdd.toDF(["name", "salary"])
# With Row objects
from pyspark.sql import Row

# Map each (name, salary) tuple to a Row with named fields.
rows = rdd.map(lambda t: Row(name=t[0], salary=t[1]))
df = spark.createDataFrame(rows)