Bootstrap

Spark Streaming整合kafka(2)

  • KafkaUtils.createStream方式(基于kafka高级Api——偏移量由zk保存)
    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
    import org.apache.spark.streaming.kafka.KafkaUtils

    import scala.collection.immutable


    //todo:利用sparkStreaming对接kafka实现单词计数----采用receiver(高级API)
    /**
     * Word count over a Kafka topic using the receiver-based (high-level consumer) API.
     *
     * Several receivers are started through `KafkaUtils.createStream`, their streams are
     * merged with `ssc.union`, and for every 5-second batch the per-word counts of the
     * message values are printed.
     */
    object SparkStreamingKafka_Receiver {

      /**
       * Program entry point: builds the streaming job and blocks until it terminates.
       *
       * @param args command-line arguments (not used)
       */
      def main(args: Array[String]): Unit = {
        // Number of Kafka receivers that are started in parallel.
        val receiverCount = 3

        // 1. Spark configuration.
        val sparkConf: SparkConf = new SparkConf()
          .setAppName("SparkStreamingKafka_Receiver")
          // Each receiver occupies one thread, so local mode needs one thread per
          // receiver plus at least one more for processing the batches.
          .setMaster(s"local[${receiverCount + 1}]")
          // Enable the receiver write-ahead log so received data can be recovered.
          .set("spark.streaming.receiver.writeAheadLog.enable", "true")

        // 2. SparkContext.
        val sc = new SparkContext(sparkConf)
        sc.setLogLevel("WARN")

        // 3. StreamingContext with a 5-second batch interval.
        val ssc = new StreamingContext(sc, Seconds(5))

        // Checkpoint directory of the streaming context.
        ssc.checkpoint("./Kafka_Receiver")

        // 4. ZooKeeper quorum.
        val zkQuorum = "node1:2181,node2:2181,node3:2181"
        // 5. Consumer group.
        val groupId = "spark_receiver"
        // 6. Topics to consume. The map value is the number of consumer threads a
        //    single receiver uses for that topic; it is not the topic's partition count.
        val topics = Map("spark_topic1" -> 2)

        // With the write-ahead log enabled the received data is already persisted in
        // the log, so the non-replicated storage level is used instead of the
        // default MEMORY_AND_DISK_SER_2 of createStream.
        val storageLevel = org.apache.spark.storage.StorageLevel.MEMORY_AND_DISK_SER

        // 7. Start `receiverCount` receivers against the same topic and group.
        val receiverDstream: immutable.IndexedSeq[ReceiverInputDStream[(String, String)]] =
          (1 to receiverCount).map { _ =>
            KafkaUtils.createStream(ssc, zkQuorum, groupId, topics, storageLevel)
          }

        // Merge the data of all receivers into one stream.
        val unionDStream: DStream[(String, String)] = ssc.union(receiverDstream)

        // 8. Keep only the message value of each (key, value) record.
        val topicData: DStream[String] = unionDStream.map(_._2)
        // 9. Split every line on a single space and pair each word with 1.
        val wordAndOne: DStream[(String, Int)] = topicData.flatMap(_.split(" ")).map((_, 1))
        // 10. Sum the counts per word within the batch.
        val result: DStream[(String, Int)] = wordAndOne.reduceByKey(_ + _)
        // 11. Print the first elements of every batch result.
        result.print()

        // Start the computation and wait for it to stop.
        ssc.start()
        ssc.awaitTermination()
      }

    }
  • KafkaUtils.createDirectStream方式(基于kafka低级Api——偏移量由客户端程序保存)
    import kafka.serializer.StringDecoder
    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.streaming.dstream.{DStream, InputDStream}
    import org.apache.spark.streaming.kafka.KafkaUtils

    //todo:利用sparkStreaming对接kafka实现单词计数----采用Direct(低级API)
    /**
     * Word count over a Kafka topic using the direct (low-level) API.
     *
     * A direct stream is created with `KafkaUtils.createDirectStream`, and for every
     * 5-second batch the per-word counts of the message values are printed.
     */
    object SparkStreamingKafka_Direct {

      /**
       * Program entry point: builds the streaming job and blocks until it terminates.
       *
       * @param args command-line arguments (not used)
       */
      def main(args: Array[String]): Unit = {
        // Spark configuration and contexts; batches are cut every 5 seconds.
        val conf: SparkConf = new SparkConf()
          .setAppName("SparkStreamingKafka_Direct")
          .setMaster("local[2]")
        val sparkContext = new SparkContext(conf)
        sparkContext.setLogLevel("WARN")
        val streamingContext = new StreamingContext(sparkContext, Seconds(5))

        // Kafka connection parameters: broker list and consumer group.
        val brokerSettings = Map(
          "metadata.broker.list" -> "node1:9092,node2:9092,node3:9092",
          "group.id" -> "Kafka_Direct"
        )
        // Topics to read from.
        val subscribedTopics = Set("spark_topic1")

        // Direct stream of (key, value) records; with this low-level API the
        // offsets are not managed by ZooKeeper.
        val messages: InputDStream[(String, String)] =
          KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
            streamingContext,
            brokerSettings,
            subscribedTopics
          )

        // value -> words (split on a single space) -> (word, 1) -> summed per word.
        val wordCounts: DStream[(String, Int)] = messages
          .map { case (_, value) => value }
          .flatMap(line => line.split(" "))
          .map(word => (word, 1))
          .reduceByKey((left, right) => left + right)

        // Print the first elements of every batch result.
        wordCounts.print()

        // Start the computation and wait for it to stop.
        streamingContext.start()
        streamingContext.awaitTermination()
      }
    }
  • KafkaUtils.createDirectStream方式:sparkStreaming对接kafka实现单词计数——采用Direct(低级API,通过checkpoint恢复StreamingContext)
    import kafka.serializer.StringDecoder
    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.streaming.dstream.{DStream, InputDStream}
    import org.apache.spark.streaming.kafka.KafkaUtils

    //todo:利用sparkStreaming对接kafka数据,实现单词计数----------通过checkpoint目录来构建StreamingContext
    /**
     * Word count over a Kafka topic using the direct (low-level) API, with the
     * `StreamingContext` obtained through `StreamingContext.getOrCreate` so that it
     * can be rebuilt from a checkpoint directory.
     */
    object SparkStreamingKafka_checkpoint {

      /**
       * Builds a fresh `StreamingContext` with the complete word-count pipeline attached.
       *
       * @param checkPointPath directory the streaming context checkpoints to
       * @return the configured, not yet started, streaming context
       */
      def createFunc(checkPointPath: String): StreamingContext = {
        // Spark configuration and contexts; batches are cut every 5 seconds.
        val conf: SparkConf = new SparkConf()
          .setAppName("SparkStreamingKafka_checkpoint")
          .setMaster("local[2]")
        val sparkContext = new SparkContext(conf)
        sparkContext.setLogLevel("WARN")
        val streamingContext = new StreamingContext(sparkContext, Seconds(5))
        streamingContext.checkpoint(checkPointPath)

        // Kafka connection parameters: broker list and consumer group.
        val brokerSettings = Map(
          "metadata.broker.list" -> "node1:9092,node2:9092,node3:9092",
          "group.id" -> "Kafka_Direct"
        )
        // Topics to read from.
        val subscribedTopics = Set("spark_topic1")

        // Direct stream of (key, value) records; with this low-level API the
        // offsets are not managed by ZooKeeper.
        val messages: InputDStream[(String, String)] =
          KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
            streamingContext,
            brokerSettings,
            subscribedTopics
          )

        // value -> words (split on a single space) -> (word, 1) -> summed per word.
        val wordCounts: DStream[(String, Int)] = messages
          .map { case (_, value) => value }
          .flatMap(line => line.split(" "))
          .map(word => (word, 1))
          .reduceByKey((left, right) => left + right)

        // Checkpoint the result stream every 5 seconds.
        wordCounts.checkpoint(Seconds(5))
        // Print the first elements of every batch result.
        wordCounts.print()

        streamingContext
      }

      /**
       * Program entry point: obtains the streaming context and runs it until termination.
       *
       * @param args command-line arguments (not used)
       */
      def main(args: Array[String]): Unit = {
        // Checkpoint directory.
        val checkPointPath = "./kafka_checkpoint"

        // getOrCreate restores the context from the checkpoint directory when it holds
        // usable checkpoint data; otherwise it calls the supplied function to build a
        // new one.
        val buildContext: () => StreamingContext = () => createFunc(checkPointPath)
        val ssc: StreamingContext = StreamingContext.getOrCreate(checkPointPath, buildContext)

        // Start the job and wait for it to stop.
        ssc.start()
        ssc.awaitTermination()
      }
    }
;