import breeze.linalg
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, Word2Vec}
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

// Reference: http://qkxue.net/info/28517/SparkML
// Interactive use with the MySQL driver on the classpath:
//   spark-shell --driver-class-path /home/hadoop/test/mysqljdbc.jar

/**
 * Trains and evaluates a text classifier with Spark ML.
 *
 * The pipeline indexes the string label, embeds the tokenised message with
 * Word2Vec, classifies the embedding with a multilayer perceptron and maps the
 * numeric prediction back to the original label string. The data is split
 * 80/20 into training and test sets; the test-set score is printed to stdout.
 */
object WbClassifier {

  /**
   * Program entry point.
   *
   * @param args command-line arguments; not read by this program
   */
  def main(args: Array[String]): Unit = {
    // Dimension of the Word2Vec embedding, which is also the MLP input layer size.
    val vectorSize = 500

    val conf = new SparkConf().setAppName("WEIBO MLPC Classification")
    val sc = new SparkContext(conf)

    // Everything runs inside try/finally so the SparkContext is released even
    // when loading, fitting or evaluation fails.
    try {
      val sqlCtx = new SQLContext(sc)

      // NOTE(review): the JDBC URL embeds a user name and password in source;
      // consider moving them to configuration before sharing this file.
      sqlCtx
        .jdbc("jdbc:mysql://192.168.0.37:3306/emotional?user=root&password=123456", "mltest")
        .registerTempTable("mltest")

      // Only the commented-out MySQL-backed input variant below consumes this
      // query; it is kept so that variant can be re-enabled without other edits.
      val value = sqlCtx.sql("SELECT mltest.svalue,mltest.words FROM mltest")

      // Active input: one "<label>\t<message>" record per line, message tokens
      // separated by single spaces. Lines that do not yield both a label and a
      // message after splitting on the tab are skipped instead of failing the
      // job with an ArrayIndexOutOfBoundsException.
      val parsedRDD = sc
        .textFile("hdfs://192.168.0.211:9000/user/hadoop/emotion/SMSSpamCollection.txt")
        .map(_.split("\t"))
        .collect { case Array(label, message, _*) => (label, message.split(" ")) }

      // Alternative inputs kept for reference:
      // val parsedRDD = value.map(p => {
      //   val v0 = p.get(0).toString
      //   val v1 = p.getString(1).split(",")
      //   (v0, v1)
      // })
      // val parsedRDD = sc.textFile("hdfs://192.168.0.211:9000/user/hadoop/emotion/20170725.txt").map(line => (line.split(" ")(3), line.split(" ")(2).split(",")))
      // val parsedRDD = sc.textFile("hdfs://192.168.0.211:9000/user/hadoop/emotion/20170726.txt").map(_.split("\t")).map(eachRow => {
      //   (eachRow(0), eachRow(1).split(" "))
      // })
      // See also: http://doc.okbase.net/u013719780/archive/239004.html

      val msgDF = sqlCtx.createDataFrame(parsedRDD).toDF("label", "message")

      // Fitted on the full data set (before the split) so that every label
      // string present in the data receives an index.
      val labelIndexer = new StringIndexer()
        .setInputCol("label")
        .setOutputCol("indexedLabel")
        .fit(msgDF)

      val word2Vec = new Word2Vec()
        .setInputCol("message")
        .setOutputCol("features")
        .setVectorSize(vectorSize)
        .setMinCount(1)

      // Input layer matches the embedding size, followed by layers of 6, 5 and 3 units.
      // NOTE(review): the final layer size (3) presumably has to be at least
      // the number of distinct labels in the data - confirm for each data set.
      val layers = Array[Int](vectorSize, 6, 5, 3)

      val multilayerPerceptronClassifier = new MultilayerPerceptronClassifier()
        .setLayers(layers)
        .setBlockSize(1024)
        .setSeed(1234L)
        .setMaxIter(456)
        .setFeaturesCol("features")
        .setLabelCol("indexedLabel")
        .setPredictionCol("prediction")

      // Maps the numeric prediction back to the label strings seen by the indexer.
      val labelConverter = new IndexToString()
        .setInputCol("prediction")
        .setOutputCol("predictedLabel")
        .setLabels(labelIndexer.labels)

      // No seed is passed, so the 80/20 split differs from run to run.
      val Array(trainingData, testData) = msgDF.randomSplit(Array(0.8, 0.2))

      val pipeline = new Pipeline()
        .setStages(Array(labelIndexer, word2Vec, multilayerPerceptronClassifier, labelConverter))

      val model = pipeline.fit(trainingData)
      val predictionResultDF = model.transform(testData)

      predictionResultDF.printSchema()
      // predictionResultDF.select("message", "label", "features", "predictedLabel").show(30)
      predictionResultDF.select("message", "features", "label", "predictedLabel").show(30)
      // predictionResultDF.select("message", "features", "label", "predictedLabel").write.save("file:///logs")

      // Show up to five distinct predicted labels; previously this job ran but
      // its result was thrown away.
      predictionResultDF.select("predictedLabel").distinct().take(5).foreach(println)

      // "precision" is the metric name used by the Spark 1.x evaluator API that
      // this program targets (it also relies on SQLContext.jdbc).
      // NOTE(review): newer Spark releases appear to expose this metric as
      // "accuracy" - confirm when upgrading.
      val evaluator = new MulticlassClassificationEvaluator()
        .setLabelCol("indexedLabel")
        .setPredictionCol("prediction")
        .setMetricName("precision")

      val predictionAccuracy = evaluator.evaluate(predictionResultDF)
      println(f"Testing Accuracy is ${predictionAccuracy * 100}%2.4f%%")
    } finally {
      sc.stop()
    }
  }
}