Bicycle Sharing Demand Prediction Using Apache Spark and Scala
Final project code and output for Bicycle Sharing Demand Prediction using Apache Spark and Scala, submitted for certification.
Run each machine learning algorithm and note the accuracy (RMSE) of each model.
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.util.IntParam
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._
import org.apache.spark.sql._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.apache.log4j._
import org.apache.spark.sql.functions.to_timestamp
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.OneHotEncoder
import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor}
import org.apache.spark.ml.regression.DecisionTreeRegressor
+----------------+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+
| datetime|season|holiday|workingday|weather| temp| atemp|humidity|windspeed|casual|registered|count|
+----------------+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+
|01-01-2011 00:00| 1| 0| 0| 1| 9.84|14.395| 81| 0.0| 3| 13| 16|
|01-01-2011 01:00| 1| 0| 0| 1| 9.02|13.635| 80| 0.0| 8| 32| 40|
|01-01-2011 02:00| 1| 0| 0| 1| 9.02|13.635| 80| 0.0| 5| 27| 32|
|01-01-2011 03:00| 1| 0| 0| 1| 9.84|14.395| 75| 0.0| 3| 10| 13|
|01-01-2011 04:00| 1| 0| 0| 1| 9.84|14.395| 75| 0.0| 0| 1| 1|
|01-01-2011 05:00| 1| 0| 0| 2| 9.84| 12.88| 75| 6.0032| 0| 1| 1|
|01-01-2011 06:00| 1| 0| 0| 1| 9.02|13.635| 80| 0.0| 2| 0| 2|
|01-01-2011 07:00| 1| 0| 0| 1| 8.2| 12.88| 86| 0.0| 1| 2| 3|
|01-01-2011 08:00| 1| 0| 0| 1| 9.84|14.395| 75| 0.0| 1| 7| 8|
|01-01-2011 09:00| 1| 0| 0| 1|13.12|17.425| 76| 0.0| 8| 6| 14|
+----------------+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+
only showing top 10 rows
trainDF: org.apache.spark.sql.DataFrame = [datetime: string, season: int ... 10 more fields]
root
|-- datetime: string (nullable = true)
|-- season: integer (nullable = true)
|-- holiday: integer (nullable = true)
|-- workingday: integer (nullable = true)
|-- weather: integer (nullable = true)
|-- temp: double (nullable = true)
|-- atemp: double (nullable = true)
|-- humidity: integer (nullable = true)
|-- windspeed: double (nullable = true)
|-- casual: integer (nullable = true)
|-- registered: integer (nullable = true)
|-- count: integer (nullable = true)
exprs: scala.collection.immutable.Map[String,String] = Map(workingday -> approx_count_distinct, windspeed -> approx_count_distinct, registered -> approx_count_distinct, count -> approx_count_distinct, atemp -> approx_count_distinct, season -> approx_count_distinct, casual -> approx_count_distinct, humidity -> approx_count_distinct, temp -> approx_count_distinct, holiday -> approx_count_distinct, weather -> approx_count_distinct)
command-2871561650106197:2: warning: class OneHotEncoder in package feature is deprecated: `OneHotEncoderEstimator` will be renamed `OneHotEncoder` and this `OneHotEncoder` will be removed in 3.0.0.
val indexer = Array("season","weather").map(c=>new OneHotEncoder().setInputCol(c).setOutputCol(c + "_Vec"))
^
indexer: Array[org.apache.spark.ml.feature.OneHotEncoder] = Array(oneHot_8f2a7b7ac2df, oneHot_8b19ed4e6d57)
pipeline: org.apache.spark.ml.Pipeline = pipeline_34ad2132a3f2
df_r: org.apache.spark.sql.DataFrame = [datetime: string, holiday: int ... 10 more fields]
+----------------+-------+----------+----+------+--------+---------+------+----------+-----+-------------+-------------+
| datetime|holiday|workingday|temp| atemp|humidity|windspeed|casual|registered|count| season_Vec| weather_Vec|
+----------------+-------+----------+----+------+--------+---------+------+----------+-----+-------------+-------------+
|01-01-2011 00:00| 0| 0|9.84|14.395| 81| 0.0| 3| 13| 16|(4,[1],[1.0])|(4,[1],[1.0])|
|01-01-2011 01:00| 0| 0|9.02|13.635| 80| 0.0| 8| 32| 40|(4,[1],[1.0])|(4,[1],[1.0])|
|01-01-2011 02:00| 0| 0|9.02|13.635| 80| 0.0| 5| 27| 32|(4,[1],[1.0])|(4,[1],[1.0])|
|01-01-2011 03:00| 0| 0|9.84|14.395| 75| 0.0| 3| 10| 13|(4,[1],[1.0])|(4,[1],[1.0])|
|01-01-2011 04:00| 0| 0|9.84|14.395| 75| 0.0| 0| 1| 1|(4,[1],[1.0])|(4,[1],[1.0])|
+----------------+-------+----------+----+------+--------+---------+------+----------+-----+-------------+-------------+
only showing top 5 rows
+--------+------+-------+----------+-------+----+-----+--------+---------+------+----------+-----+
|datetime|season|holiday|workingday|weather|temp|atemp|humidity|windspeed|casual|registered|count|
+--------+------+-------+----------+-------+----+-----+--------+---------+------+----------+-----+
| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|
+--------+------+-------+----------+-------+----+-----+--------+---------+------+----------+-----+
Linear Regression Root Mean Squared Error (RMSE) on train_test data = 136.5774373860326
lr: org.apache.spark.ml.regression.LinearRegression = linReg_1ea40cad23bb
pipeline: org.apache.spark.ml.Pipeline = pipeline_f2ddbe3d23c6
lrModel: org.apache.spark.ml.PipelineModel = pipeline_f2ddbe3d23c6
predictions: org.apache.spark.sql.DataFrame = [datetime: timestamp, holiday: int ... 17 more fields]
evaluator: org.apache.spark.ml.evaluation.RegressionEvaluator = regEval_2f1ee7361559
rmse: Double = 136.5774373860326
GBT Regressor Root Mean Squared Error (RMSE) on train_test data = 59.3978579039326
gbt: org.apache.spark.ml.regression.GBTRegressor = gbtr_e579397772d8
pipeline: org.apache.spark.ml.Pipeline = pipeline_e9de176b1b8e
gbtModel: org.apache.spark.ml.PipelineModel = pipeline_e9de176b1b8e
predictions: org.apache.spark.sql.DataFrame = [datetime: timestamp, holiday: int ... 17 more fields]
evaluator: org.apache.spark.ml.evaluation.RegressionEvaluator = regEval_0359461a0553
rmse: Double = 59.3978579039326
Decision Tree Regressor Root Mean Squared Error (RMSE) on train_test data = 105.71823914900725
dt: org.apache.spark.ml.regression.DecisionTreeRegressor = dtr_c6a610ffbe6f
pipeline: org.apache.spark.ml.Pipeline = pipeline_88e50475cb10
dtModel: org.apache.spark.ml.PipelineModel = pipeline_88e50475cb10
predictions: org.apache.spark.sql.DataFrame = [datetime: timestamp, holiday: int ... 17 more fields]
evaluator: org.apache.spark.ml.evaluation.RegressionEvaluator = regEval_eb9af058ad63
rmse: Double = 105.71823914900725
Random Forest Regressor Root Mean Squared Error (RMSE) on train_test data = 108.49954488266827
rf: org.apache.spark.ml.regression.RandomForestRegressor = rfr_ab681154d918
pipeline: org.apache.spark.ml.Pipeline = pipeline_b61910bcfa69
rfModel: org.apache.spark.ml.PipelineModel = pipeline_b61910bcfa69
predictions: org.apache.spark.sql.DataFrame = [datetime: timestamp, holiday: int ... 17 more fields]
evaluator: org.apache.spark.ml.evaluation.RegressionEvaluator = regEval_607a2e7dc8bf
rmse: Double = 108.49954488266827
No comments