
Apache Spark 03 - Read Data From File

Jupyter Notebook

import findspark
findspark.init("/opt/spark")  # point findspark at the local Spark installation
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("Read Data From File") \
    .master("local[2]") \
    .getOrCreate()
! curl -o datasets/Mall_Customers.csv \
https://raw.githubusercontent.com/yemrekarakas/Datasets/main/Mall_Customers.csv

      % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                     Dload  Upload   Total   Spent    Left  Speed
    100  4365  100  4365    0     0   5321      0 --:--:-- --:--:-- --:--:--  5336
! ls -l datasets | grep Mall

    -rw-rw-r--. 1 yek yek 4365 Apr 23 14:04 Mall_Customers.csv
# "file://" explicitly targets the local filesystem; without a scheme Spark
# falls back to the configured default filesystem (HDFS on a typical cluster)
df = spark.read.csv("file:///home/yek/spark_local/datasets/Mall_Customers.csv")

# other option
# df = spark.read.format("csv").load(path="/home/yek/spark_local/datasets/Mall_Customers.csv")

# another option
# df = spark.read.load(path="/home/yek/spark_local/datasets/Mall_Customers.csv", format="csv", header=True, inferSchema=True)
df.show(5)

    +----------+------+---+------------+-------------+
    |       _c0|   _c1|_c2|         _c3|          _c4|
    +----------+------+---+------------+-------------+
    |CustomerID|Gender|Age|AnnualIncome|SpendingScore|
    |         1|  Male| 19|       15000|           39|
    |         2|  Male| 21|       15000|           81|
    |         3|Female| 20|       16000|            6|
    |         4|Female| 23|       16000|           77|
    +----------+------+---+------------+-------------+
    only showing top 5 rows

df.count()

    201

The count is 201 because no header option was set, so the header line was read as an ordinary data row.
df.limit(5).toPandas()
              _c0     _c1  _c2           _c3            _c4
    0  CustomerID  Gender  Age  AnnualIncome  SpendingScore
    1           1    Male   19         15000             39
    2           2    Male   21         15000             81
    3           3  Female   20         16000              6
    4           4  Female   23         16000             77

Header Option

df2 = spark.read \
    .option("header", "True") \
    .csv("file:///home/yek/spark_local/datasets/Mall_Customers.csv")
df2.show(5)

    +----------+------+---+------------+-------------+
    |CustomerID|Gender|Age|AnnualIncome|SpendingScore|
    +----------+------+---+------------+-------------+
    |         1|  Male| 19|       15000|           39|
    |         2|  Male| 21|       15000|           81|
    |         3|Female| 20|       16000|            6|
    |         4|Female| 23|       16000|           77|
    |         5|Female| 31|       17000|           40|
    +----------+------+---+------------+-------------+
    only showing top 5 rows

df2.limit(5).toPandas()
       CustomerID  Gender  Age  AnnualIncome  SpendingScore
    0           1    Male   19         15000             39
    1           2    Male   21         15000             81
    2           3  Female   20         16000              6
    3           4  Female   23         16000             77
    4           5  Female   31         17000             40
df2.printSchema()

    root
     |-- CustomerID: string (nullable = true)
     |-- Gender: string (nullable = true)
     |-- Age: string (nullable = true)
     |-- AnnualIncome: string (nullable = true)
     |-- SpendingScore: string (nullable = true)

Without the inferSchema option every column comes back as a string; the next option fixes that.

InferSchema Option

df3 = spark.read \
    .option("header", True) \
    .option("inferSchema", True) \
    .csv("file:///home/yek/spark_local/datasets/Mall_Customers.csv")
df3.printSchema()

    root
     |-- CustomerID: integer (nullable = true)
     |-- Gender: string (nullable = true)
     |-- Age: integer (nullable = true)
     |-- AnnualIncome: integer (nullable = true)
     |-- SpendingScore: integer (nullable = true)

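As an alternative to inferSchema, which costs Spark an extra pass over the file, the schema can be supplied explicitly. A minimal sketch; the mall_schema and df_explicit names are illustrative, and the types simply mirror the schema inferred above:

from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# an explicit schema skips type inference entirely
mall_schema = StructType([
    StructField("CustomerID", IntegerType(), True),
    StructField("Gender", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("AnnualIncome", IntegerType(), True),
    StructField("SpendingScore", IntegerType(), True),
])

df_explicit = spark.read \
    .option("header", True) \
    .schema(mall_schema) \
    .csv("file:///home/yek/spark_local/datasets/Mall_Customers.csv")

df_explicit.printSchema()
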
Separator Option - default is comma (,)

df4 = spark.read \
    .option("header", "True") \
    .option("inferSchema", "True") \
    .option("sep", ",") \
    .csv("file:///home/yek/spark_local/datasets/Mall_Customers.csv")
df4.show(5)

    +----------+------+---+------------+-------------+
    |CustomerID|Gender|Age|AnnualIncome|SpendingScore|
    +----------+------+---+------------+-------------+
    |         1|  Male| 19|       15000|           39|
    |         2|  Male| 21|       15000|           81|
    |         3|Female| 20|       16000|            6|
    |         4|Female| 23|       16000|           77|
    |         5|Female| 31|       17000|           40|
    +----------+------+---+------------+-------------+
    only showing top 5 rows

df4.limit(5).toPandas()
       CustomerID  Gender  Age  AnnualIncome  SpendingScore
    0           1    Male   19         15000             39
    1           2    Male   21         15000             81
    2           3  Female   20         16000              6
    3           4  Female   23         16000             77
    4           5  Female   31         17000             40
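
The same sep option covers other delimiters as well. A hypothetical sketch, left commented out like the alternatives earlier in the post because no such file is downloaded here:

# for a semicolon-delimited file only the "sep" value changes (path is illustrative)
# df5 = spark.read \
#     .option("header", True) \
#     .option("inferSchema", True) \
#     .option("sep", ";") \
#     .csv("file:///home/yek/spark_local/datasets/some_semicolon_file.csv")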
spark.stop()
This post is licensed under CC BY 4.0 by the author.