
2017-07 public opinion (sentiment) analysis system code

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, Word2Vec}
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
// Reference: http://qkxue.net/info/28517/SparkML
// Run with the MySQL JDBC driver on the classpath, e.g.:
// spark-shell --driver-class-path /home/hadoop/test/mysqljdbc.jar
object WbClassifier {

  def main(args: Array[String]): Unit = {

    // Dimension of the Word2Vec feature vectors
    val VECTOR_SIZE = 500

    val conf = new SparkConf().setAppName("WEIBO MLPC Classification")
    val sc = new SparkContext(conf)
    val sqlCtx = new SQLContext(sc)

    // Load labelled samples from MySQL (requires the JDBC driver on the classpath, see above)
    val mysqlDF = sqlCtx.read.jdbc("jdbc:mysql://192.168.0.37:3306/emotional?user=root&password=123456", "mltest", new java.util.Properties())
    mysqlDF.registerTempTable("mltest")
    val value = sqlCtx.sql("SELECT mltest.svalue, mltest.words FROM mltest")
    // Training data: one sample per line in the form "label<TAB>space-separated words"
    val parsedRDD = sc.textFile("hdfs://192.168.0.211:9000/user/hadoop/emotion/SMSSpamCollection.txt").map(_.split("\t")).map(eachRow => {
      (eachRow(0), eachRow(1).split(" "))
    })
    // Alternative: build the same (label, words) pairs from the MySQL table
    // (see the runnable sketch after this object)
    // val parsedRDD = value.map(p => {
    //   val v0 = p.get(0).toString
    //   val v1 = p.getString(1).split(",")
    //   (v0, v1)
    // })

    // Other input files that were tried:
    // val parsedRDD = sc.textFile("hdfs://192.168.0.211:9000/user/hadoop/emotion/20170725.txt").map(line => (line.split(" ")(3), line.split(" ")(2).split(",")))
    // val parsedRDD = sc.textFile("hdfs://192.168.0.211:9000/user/hadoop/emotion/20170726.txt").map(_.split("\t")).map(eachRow => {
    //   (eachRow(0), eachRow(1).split(" "))
    // })

    // Reference: http://doc.okbase.net/u013719780/archive/239004.html
    val msgDF = sqlCtx.createDataFrame(parsedRDD).toDF("label", "message")

    // Pipeline stages: index the string labels, embed the token sequences with Word2Vec,
    // train a multilayer perceptron, then map predictions back to string labels
    val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(msgDF)
    val word2Vec = new Word2Vec().setInputCol("message").setOutputCol("features").setVectorSize(VECTOR_SIZE).setMinCount(1)
    // Network topology: input layer of VECTOR_SIZE, two hidden layers (6 and 5 units), 3 output classes
    val layers = Array[Int](VECTOR_SIZE, 6, 5, 3)
    val multilayerPerceptronClassifier = new MultilayerPerceptronClassifier().setLayers(layers).setBlockSize(1024).setSeed(1234L).setMaxIter(456).setFeaturesCol("features").setLabelCol("indexedLabel").setPredictionCol("prediction")
    // Convert indexed predictions back to the original string labels
    val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
    val Array(trainingData, testData) = msgDF.randomSplit(Array(0.8, 0.2))
    val pipeline = new Pipeline().setStages(Array(labelIndexer, word2Vec, multilayerPerceptronClassifier, labelConverter))
    val model = pipeline.fit(trainingData)
    val predictionResultDF = model.transform(testData)
    predictionResultDF.printSchema()
    predictionResultDF.select("message", "features", "label", "predictedLabel").show(30)
    // predictionResultDF.select("message", "features", "label", "predictedLabel").write.save("file:///logs")
    // Inspect the distinct predicted labels
    predictionResultDF.select("predictedLabel").distinct().show(5)
    // In Spark 1.x, "precision" on MulticlassClassificationEvaluator is the overall accuracy;
    // Spark 2.x renamed this metric to "accuracy"
    val evaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("precision")
    val predictionAccuracy = evaluator.evaluate(predictionResultDF)
    println("Testing Accuracy is %2.4f".format(predictionAccuracy * 100) + "%")
    sc.stop()

  }

}
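
To train from the MySQL table instead of the HDFS file, the commented-out mapping above has to go through the DataFrame's underlying RDD. A minimal sketch, assuming the mltest columns are svalue (class label) and words (comma-separated tokens) as the commented code implies; the names parsedFromMysql and msgFromMysqlDF are illustrative:

// Sketch (assumption): derive (label, words) pairs from the MySQL DataFrame `value`
// instead of the HDFS text file. Going through .rdd works on Spark 1.x and 2.x alike.
val parsedFromMysql = value.rdd.map { row =>
  (row.get(0).toString, row.getString(1).split(","))
}
val msgFromMysqlDF = sqlCtx.createDataFrame(parsedFromMysql).toDF("label", "message")
// msgFromMysqlDF can then replace msgDF when fitting labelIndexer and the pipeline.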
