Create a DataFrame from a file

 

from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession that runs locally with one worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("npblue.com")
    .getOrCreate()
)

# Location of the CSV file to load.
filePath = "/Users/npblue/PycharmProjects/learning/data/SP500.csv"

# NOTE: sparkContext.textFile() yields an RDD with one string per line of the
# file (the header row is just another line) -- it is not a DataFrame, even
# though the variable is called `df`.
df = spark.sparkContext.textFile(filePath)

# Apply print() to every element (line) of the RDD.
df.foreach(print)

Output

 

Date,Open,High,Low,Close,Adj Close,Volume
1950-01-03,16.660000,16.660000,16.660000,16.660000,16.660000,1260000
1950-01-04,16.850000,16.850000,16.850000,16.850000,16.850000,1890000
1950-01-05,16.930000,16.930000,16.930000,16.930000,16.930000,2550000
1950-01-06,16.980000,16.980000,16.980000,16.980000,16.980000,2010000
1950-01-09,17.080000,17.080000,17.080000,17.080000,17.080000,2520000
1950-01-10,17.030001,17.030001,17.030001,17.030001,17.030001,2160000
1950-01-11,17.090000,17.090000,17.090000,17.090000,17.090000,2630000
1950-01-12,16.760000,16.760000,16.760000,16.760000,16.760000,2970000
1950-01-13,16.670000,16.670000,16.670000,16.670000,16.670000,3330000