# if not installed already
pip install pyspark
pip install findspark
# Import general spark libraries
import findspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Import Spark ML libraries
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
findspark.init()
Create Spark Session
Save & Load SparkML Model
Objectives
In this section we will:
- Create a simple Linear Regression Model
- Save the SparkML model
- Load the SparkML model
- Make predictions using the loaded SparkML model
Setup
Create Spark Session & Context
# Create a spark context class
sc = SparkContext()

# Create a spark session
spark = SparkSession \
    .builder \
    .appName("Saving and Loading a SparkML Model").getOrCreate()

# Check context (in a notebook, evaluating `sc` displays its summary)
sc  # __ OUTPUT
SparkContext
Spark UI
Version v2.4.3
Master local[*]
AppName pyspark-shell
Analyze
Create DF with Sample Data
# Create a simple data set of infant height(cms) weight(kgs) chart.
# Each row is [height_cm, weight_kg].
mydata = [[46,2.5],[51,3.4],[54,4.4],[57,5.1],[60,5.6],[61,6.1],[63,6.4]]

# Mention column names of dataframe (same order as the values in each row)
columns = ["height", "weight"]

# creating a dataframe
mydf = spark.createDataFrame(mydata, columns)

# show data frame
mydf.show()
+------+------+
|height|weight|
+------+------+
| 46| 2.5|
| 51| 3.4|
| 54| 4.4|
| 57| 5.1|
| 60| 5.6|
| 61| 6.1|
| 63| 6.4|
+------+------+
Convert Columns to Feature Vectors
We use the VectorAssembler()
function to convert the dataframe columns into feature vectors.
- For our example, we use the height of the infant as input features and
- The weight as target labels
# Pack the "height" column into a single vector column named "features",
# which is the input format Spark ML estimators expect.
assembler = VectorAssembler(
    inputCols=["height"],
    outputCol="features")

# Keep only the feature vector and the label column ("weight") for training.
data = assembler.transform(mydf).select('features','weight')
data.show()
+--------+------+
|features|weight|
+--------+------+
| [46.0]| 2.5|
| [51.0]| 3.4|
| [54.0]| 4.4|
| [57.0]| 5.1|
| [60.0]| 5.6|
| [61.0]| 6.1|
| [63.0]| 6.4|
+--------+------+
Create & Train LR Model
# Create a LR model: "features" is the input vector column, "weight" the label.
lr = LinearRegression(featuresCol='features', labelCol='weight', maxIter=100)
# Set the regularization parameter.
lr.setRegParam(0.1)
# Fit the model
lrModel = lr.fit(data)
Save LR Model
# Persist the fitted model to the path 'infantheight2.model'.
lrModel.save('infantheight2.model')
Load LR Model
# You need LinearRegressionModel to load the model
from pyspark.ml.regression import LinearRegressionModel

# Read the previously saved model back from disk.
model = LinearRegressionModel.load('infantheight2.model')
Make Predictions
1- Predict W from H
Predict the weight of an infant whose height is 70 CMs.
Create Function
# This function converts a scalar number into a dataframe that can be used by the model to predict.
def predict(weight):
    """Show the model's predicted weight for a single infant height.

    Args:
        weight: Despite its name, this is the model's *input* value, i.e. the
            infant's height in cm (the model is fitted with height as the
            feature and weight as the label). The parameter name is kept
            unchanged for backward compatibility.

    Returns:
        None. The single-row ``prediction`` column is printed via ``show()``.

    Relies on the module-level ``spark`` session and the currently loaded
    ``model``.
    """
    # Build a one-row dataframe holding the input height.
    input_df = spark.createDataFrame([[weight]], ["height"])

    # The model reads its input from a vector column named "features".
    assembler = VectorAssembler(inputCols=["height"], outputCol="features")
    features_df = assembler.transform(input_df).select('features')

    predictions = model.transform(features_df)
    predictions.select('prediction').show()
Predict
# Predict the weight for a height of 70 cm.
predict(70)
+-----------------+
| prediction|
+-----------------+
|7.863454719775907|
+-----------------+
2- Predict W from H
Let’s save the model under a different name and predict the weight for another height, 50 cm.
Rename LR Model
- We just use the save command with a new name
# Save the same fitted model again under a new path.
lrModel.save('babyweightprediction.model')
Load Model
# Rebind `model` to the copy loaded from the new path.
model = LinearRegressionModel.load('babyweightprediction.model')
Predict W
# Predict the weight for a height of 50 cm.
predict(50)
+------------------+
| prediction|
+------------------+
|3.4666826711164465|
+------------------+