```python
# importing libraries
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
```

```python
# loading the file, with a fallback for older Spark versions
if sc.version >= '2.0':
    # Spark 2.0+ includes CSV as a native Spark SQL data source.
    df = sqlContext.read.format('csv').option("header", 'true').load("/FileStore/tables/e43cbfn61493236024395/train.csv")
else:
    # Earlier Spark versions can use the spark-csv package.
    df = sqlContext.read.format('com.databricks.spark.csv').option("header", 'true').load("/FileStore/tables/e43cbfn61493236024395/train.csv")

df.cache()
display(df)

# count rows
print("Our dataset has %d rows." % df.count())
```

```python
# dropping attributes we will not use as features
df = df.drop("Name").drop("Ticket").drop("Cabin")
display(df)

# display the schema
df.printSchema()
```

```python
# The following call takes all columns (df.columns) and casts them
# to a numeric type (DoubleType) using Spark SQL.
from pyspark.sql.functions import col  # col() refers to a column by its string name
df = df.select([col(c).cast("double").alias(c) for c in df.columns])
df.printSchema()
```

Now that we have set up our workflow, we can train the Pipeline. Calling fit() runs feature processing, model tuning, and training in a single call, and returns a fitted Pipeline containing the best model found.

```python
pipelineModel = pipeline.fit(train)
```

Our final step is to use the fitted model to make predictions on new data. We will use our held-out test set, but we could also apply the model to completely new data; for example, given records for passengers the model has never seen, we could predict whether they would have survived. We will also evaluate our predictions: computing evaluation metrics is important for understanding the quality of predictions, as well as for comparing models and tuning parameters.

Calling transform() on a new dataset passes that data through feature processing and uses the fitted model to make predictions. We get back a DataFrame with a new prediction column (as well as intermediate results, such as the rawFeatures column from feature processing).

```python
predictions = pipelineModel.transform(test)
```

- PassengerId: the identifier of the passenger
- prediction: the predicted survival label (1 = survived, 0 = did not survive)

```python
display(predictions.select("PassengerId", "prediction", *featuresCols))
```
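The cells that build the pipeline, split the data, and run the evaluation are not shown in this section: `pipeline`, `train`, `test`, and `featuresCols` are all used above without being defined here. Below is a minimal sketch of what those elided steps might look like, assuming the Kaggle Titanic schema (Pclass, Age, SibSp, Parch, Fare, Survived, PassengerId). The stage names (labelIndexer, assembler, vectorIndexer, rf) are illustrative, not the notebook's actual definitions, and the model-tuning step mentioned in the text (e.g. a CrossValidator wrapped around the classifier) is omitted for brevity.

```python
# A hypothetical reconstruction of the elided pipeline-setup and
# evaluation cells. Assumes the Kaggle Titanic schema; all names
# below are illustrative, not the notebook's actual definitions.
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Caveat: the blanket cast to double above turns string columns such as
# Sex and Embarked into nulls, so we restrict to numeric features and
# drop rows with missing values (e.g. Age is missing for some passengers).
featuresCols = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
data = df.select(featuresCols + ["Survived", "PassengerId"]).na.drop()

# Index the label, assemble the features into a single vector, and let
# VectorIndexer flag low-cardinality features (such as Pclass) as
# categorical so the trees can treat them accordingly.
labelIndexer = StringIndexer(inputCol="Survived", outputCol="indexedLabel")
assembler = VectorAssembler(inputCols=featuresCols, outputCol="rawFeatures")
vectorIndexer = VectorIndexer(inputCol="rawFeatures", outputCol="features",
                              maxCategories=10)
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features")
pipeline = Pipeline(stages=[labelIndexer, assembler, vectorIndexer, rf])

# Hold out 30% of the rows as a test set.
train, test = data.randomSplit([0.7, 0.3], seed=42)

pipelineModel = pipeline.fit(train)
predictions = pipelineModel.transform(test)

# Evaluate the predictions on the held-out test set.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction",
    metricName="accuracy")
print("Test accuracy: %g" % evaluator.evaluate(predictions))
```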