DataFrame show() vs take() methods

 

from pyspark.sql import SparkSession

# Build (or reuse) a Spark session using the "local[1]" master.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("npblue.com")
    .getOrCreate()
)

# Sample rows; every tuple follows the field order given in `columns`.
data = [
    ("Narender", "", "Paul", "1991-04-01", "M", 3000),
    ("John ", "Rose", "", "2000-05-19", "M", 4000),
    ("Hello", "", "Williams", "1978-09-05", "M", 4000),
    ("Maria", "Anne", "Jones", "1967-12-01", "F", 4000),
    ("Jeo", "Mary", "Brown", "1980-02-17", "F", -1),
]

# Only column names are supplied as the schema; no explicit types are given.
columns = [
    "firstname",
    "middlename",
    "lastname",
    "dob",
    "gender",
    "salary",
]
df = spark.createDataFrame(data, schema=columns)

 

# Print the first two rows as a formatted text table.
df.show(n=2)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
| Narender|          |    Paul|1991-04-01|     M|  3000|
|    John |      Rose|        |2000-05-19|     M|  4000|
+---------+----------+--------+----------+------+------+
only showing top 2 rows

vs. take(2), which returns the first two rows as a list of Row objects instead of printing a table:

 
# Fetch the first two rows as a Python list of Row objects, then print that list.
first_two_rows = df.take(2)
print(first_two_rows)


[Row(firstname='Narender', middlename='', lastname='Paul', dob='1991-04-01', gender='M', salary=3000), Row(firstname='John ', middlename='Rose', lastname='', dob='2000-05-19', gender='M', salary=4000)]