spark-sql-delta-2-simple.py
from pyspark.sql import SparkSession
import uuid
import sys
# initialize the output path
output_path = None
# read the S3 output path from the first EMR job argument, if provided
if len(sys.argv) > 1:
    output_path = sys.argv[1]
else:
    print("S3 output location not specified, printing the output instead")
# use a SparkSession (rather than a SparkContext) as the entry point
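# the Glue client factory points the Hive metastore at the AWS Glue Data Catalog;
# the Delta extension and catalog settings enable Delta Lake support in Spark SQL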
spark = (
    SparkSession.builder.appName("SparkSQL")
    .config("hive.metastore.client.factory.class", "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .enableHiveSupport()
    .getOrCreate()
)
if output_path:
    # generate a random output path under the given prefix for testing
    if output_path.endswith("/"):
        url = output_path + str(uuid.uuid4())
    else:
        url = output_path + "/" + str(uuid.uuid4())
    # create a Delta table and write it to the target S3 location
    spark.range(5).write.format("delta").save(url)
    print("Writing range data to S3 output: " + url + ".")
    # read the Delta table back and show its contents (visible in the job logs)
    spark.read.format("delta").load(url).show()
else:
    spark.range(5).show()
    print("No output location, printing data.")
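
# Usage sketch (an assumption, not part of the original job): the Delta configs
# above need the Delta Lake jars on the classpath, e.g. an EMR release with
# Delta enabled, or spark-submit with --packages io.delta:delta-core_2.12:<version>.
# A hypothetical invocation with a made-up bucket would look like:
#   spark-submit spark-sql-delta-2-simple.py s3://your-bucket/delta-output/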