ml_pipeline_cookiecutter/{{cookiecutter.project_slug}}/{{cookiecutter.module_name}}/data/spark.py

#!/usr/bin/env python3
from sys import stdout
import csv
# 'pip install pyspark' for these
from pyspark import SparkFiles
from pyspark.sql import SparkSession
# make a spark "session". by default this spins up a local spark cluster (!)
spark = SparkSession.builder.getOrCreate()
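# (to configure the session explicitly instead, something like the sketch
# below works; the master url and app name are assumptions, not part of the
# original script:)
# spark = (
#     SparkSession.builder
#     .master("local[*]")          # run on all local cores
#     .appName("stock-exchanges")  # hypothetical app name
#     .getOrCreate()
# )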
# download the input file and distribute it to the cluster's nodes:
spark.sparkContext.addFile("https://csvbase.com/meripaterson/stock-exchanges.csv")
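# (addFile fetches the url once and ships the file to every node;
# SparkFiles.get, used below, resolves the local path it landed at)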
# reading, selecting and sorting work much as they do in pandas
df = (
    spark.read.csv(f"file://{SparkFiles.get('stock-exchanges.csv')}", header=True)
    .select("MIC")
    .na.drop()
    .sort("MIC")
)
# pyspark has no easy way to write csv to stdout, so use python's csv lib
csv.writer(stdout).writerows(df.collect())
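
# (spark's own csv writer is an alternative, but it writes a directory of
# part-files rather than a single stream; a sketch, with a hypothetical
# output path:)
# df.coalesce(1).write.csv("/tmp/stock-exchanges-out", header=True)

# shut the local cluster down cleanly
spark.stop()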