Apache Spark 03 - Read Data From File
03 - Read Data From File
Jupyter Notebook
1
2
3
import findspark
findspark.init("/opt/spark")
from pyspark.sql import SparkSession
1
2
3
4
# Build (or reuse) a local SparkSession with 2 worker threads.
spark = (
    SparkSession.builder
    .appName("Read Data From File")
    .master("local[2]")
    .getOrCreate()
)
1
2
3
4
5
6
! curl -o datasets/Mall_Customers.csv \
https://raw.githubusercontent.com/yemrekarakas/Datasets/main/Mall_Customers.csv
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 4365 100 4365 0 0 5321 0 --:--:-- --:--:-- --:--:-- 5336
1
2
3
! ls -l datasets | grep Mall
-rw-rw-r--. 1 yek yek 4365 Apr 23 14:04 Mall_Customers.csv
1
2
3
4
5
6
7
# Read the CSV with no options: every column is typed string and auto-named
# _c0.._c4, and the header row is treated as ordinary data (see df.count() below).
df = spark.read.csv("file:///home/yek/spark_local/datasets/Mall_Customers.csv")
# other option
# df = spark.read.format("csv").load(path="/home/yek/spark_local/datasets/Mall_Customers.csv")
# another option (note: the option key is "inferSchema")
# df = spark.read.load(path="/home/yek/spark_local/datasets/Mall_Customers.csv", format="csv", header=True, inferSchema=True)
1
2
3
4
5
6
7
8
9
10
11
12
13
df.show(5)
+----------+------+---+------------+-------------+
| _c0| _c1|_c2| _c3| _c4|
+----------+------+---+------------+-------------+
|CustomerID|Gender|Age|AnnualIncome|SpendingScore|
| 1| Male| 19| 15000| 39|
| 2| Male| 21| 15000| 81|
| 3|Female| 20| 16000| 6|
| 4|Female| 23| 16000| 77|
+----------+------+---+------------+-------------+
only showing top 5 rows
1
2
3
df.count()
201 (200 data rows plus the header row, which is counted as data because the header option was not set)
1
df.limit(5).toPandas()
_c0 | _c1 | _c2 | _c3 | _c4 | |
---|---|---|---|---|---|
0 | CustomerID | Gender | Age | AnnualIncome | SpendingScore |
1 | 1 | Male | 19 | 15000 | 39 |
2 | 2 | Male | 21 | 15000 | 81 |
3 | 3 | Female | 20 | 16000 | 6 |
4 | 4 | Female | 23 | 16000 | 77 |
Header Option
1
2
3
# Re-read the file, this time taking column names from the first row.
df2 = (
    spark.read
    .option("header", "True")
    .csv("file:///home/yek/spark_local/datasets/Mall_Customers.csv")
)
1
2
3
4
5
6
7
8
9
10
11
12
13
df2.show(5)
+----------+------+---+------------+-------------+
|CustomerID|Gender|Age|AnnualIncome|SpendingScore|
+----------+------+---+------------+-------------+
| 1| Male| 19| 15000| 39|
| 2| Male| 21| 15000| 81|
| 3|Female| 20| 16000| 6|
| 4|Female| 23| 16000| 77|
| 5|Female| 31| 17000| 40|
+----------+------+---+------------+-------------+
only showing top 5 rows
1
df2.limit(5).toPandas()
CustomerID | Gender | Age | AnnualIncome | SpendingScore | |
---|---|---|---|---|---|
0 | 1 | Male | 19 | 15000 | 39 |
1 | 2 | Male | 21 | 15000 | 81 |
2 | 3 | Female | 20 | 16000 | 6 |
3 | 4 | Female | 23 | 16000 | 77 |
4 | 5 | Female | 31 | 17000 | 40 |
1
2
3
4
5
6
7
8
9
df2.printSchema()
root
|-- CustomerID: string (nullable = true)
|-- Gender: string (nullable = true)
|-- Age: string (nullable = true)
|-- AnnualIncome: string (nullable = true)
|-- SpendingScore: string (nullable = true)
InferSchema Option
1
2
3
4
# header + inferSchema: names come from the first row and numeric
# columns are detected as integers instead of defaulting to string.
df3 = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file:///home/yek/spark_local/datasets/Mall_Customers.csv")
)
1
2
3
4
5
6
7
8
9
df3.printSchema()
root
|-- CustomerID: integer (nullable = true)
|-- Gender: string (nullable = true)
|-- Age: integer (nullable = true)
|-- AnnualIncome: integer (nullable = true)
|-- SpendingScore: integer (nullable = true)
Separator Option - default comma (,)
1
2
3
4
5
# Same read as df3, with the field separator set explicitly
# (comma is already the default) via a single options() call.
df4 = (
    spark.read
    .options(header="True", inferSchema="True", sep=",")
    .csv("file:///home/yek/spark_local/datasets/Mall_Customers.csv")
)
1
2
3
4
5
6
7
8
9
10
11
12
13
df4.show(5)
+----------+------+---+------------+-------------+
|CustomerID|Gender|Age|AnnualIncome|SpendingScore|
+----------+------+---+------------+-------------+
| 1| Male| 19| 15000| 39|
| 2| Male| 21| 15000| 81|
| 3|Female| 20| 16000| 6|
| 4|Female| 23| 16000| 77|
| 5|Female| 31| 17000| 40|
+----------+------+---+------------+-------------+
only showing top 5 rows
1
df4.limit(5).toPandas()
CustomerID | Gender | Age | AnnualIncome | SpendingScore | |
---|---|---|---|---|---|
0 | 1 | Male | 19 | 15000 | 39 |
1 | 2 | Male | 21 | 15000 | 81 |
2 | 3 | Female | 20 | 16000 | 6 |
3 | 4 | Female | 23 | 16000 | 77 |
4 | 5 | Female | 31 | 17000 | 40 |
1
spark.stop()
This post is licensed under CC BY 4.0 by the author.