POM dependencies
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.12</artifactId>
<version>3.0.0</version>
</dependency>
<!-- 2.12 is the Scala version, 3.0.0 is the Spark version -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-yarn_2.12</artifactId>
<version>3.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.12</artifactId>
<version>3.0.0</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.27</version>
</dependency>
<dependency>
<groupId>com.atguigu</groupId>
<artifactId>summer-framework</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.12</artifactId>
<version>3.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>3.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.12</artifactId>
<version>3.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
<version>3.0.0</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.10.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.alibaba/druid -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid</artifactId>
<version>1.1.10</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- This plugin compiles Scala code into class files -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
<executions>
<execution>
<!-- Bind to Maven's compile phase -->
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
Consuming data from Kafka
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
object ReceiveKafkaData {
def main(args: Array[String]): Unit = {
// TODO Consume data from the specified Kafka topic
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("StreamingWordCount")
val ssc = new StreamingContext(sparkConf, Seconds(3))
// TODO Kafka is typically used for real-time data transport, so within Spark Streaming
// we can read from it with the KafkaUtils helper class
val kafkaPara: Map[String, Object] = Map[String, Object](
ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "linux1:9092,linux2:9092,linux3:9092",
ConsumerConfig.GROUP_ID_CONFIG -> "atguigu0317",
"key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
"value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer"
)
// Access Kafka through the helper class: just pass the topic and the connection config
val kafkaDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](Set("atguigu0317"), kafkaPara)
)
// Kafka records are key-value pairs, which is why the element type here is (String, String).
// The key is usually not set (null), so only the value is needed when reading the data.
kafkaDStream.map(_.value()).print()
ssc.start()
ssc.awaitTermination()
}
}
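The requirements that follow all consume ad-click records from this topic, one record per line in the form "timestamp area city userid adid", separated by spaces. The original notes do not include a data generator; a minimal mock producer you could use to feed test data is sketched below (topic name and brokers are copied from the examples, all field values and the send rate are assumptions).

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import scala.util.Random

object MockAdClickProducer {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "linux1:9092,linux2:9092,linux3:9092")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)

    // Hypothetical sample values; any strings without spaces would work.
    val areas = List("North", "East", "South")
    val cities = List("city1", "city2", "city3")
    val random = new Random()

    // Runs until the process is killed; the producer is intentionally never closed in this mock.
    while (true) {
      // One record per line: "timestamp area city userid adid", space separated,
      // matching the split(" ") parsing in the consumers below.
      val ts = System.currentTimeMillis()
      val area = areas(random.nextInt(areas.length))
      val city = cities(random.nextInt(cities.length))
      val userid = (random.nextInt(6) + 1).toString
      val adid = (random.nextInt(6) + 1).toString
      producer.send(new ProducerRecord[String, String]("atguigu0317", s"$ts $area $city $userid $adid"))
      Thread.sleep(100)
    }
  }
}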
Requirement 1: Ad blacklist
1.1
import java.sql.{DriverManager, ResultSet}
import java.text.SimpleDateFormat
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import scala.collection.mutable.ListBuffer
object Req1_BlackList {
def main(args: Array[String]): Unit = {
// TODO Requirement: blacklist any user who clicks the same ad more than 100 times in a day
// (note: the code below actually uses a threshold of 30)
// TODO Consume data from the specified Kafka topic
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("StreamingBlackList")
val ssc = new StreamingContext(sparkConf, Seconds(3))
// TODO Kafka is typically used for real-time data transport, so within Spark Streaming
// we can read from it with the KafkaUtils helper class
val kafkaPara: Map[String, Object] = Map[String, Object](
ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "linux1:9092,linux2:9092,linux3:9092",
ConsumerConfig.GROUP_ID_CONFIG -> "atguigu0317",
"key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
"value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer"
)
val kafkaDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](Set("atguigu0317"), kafkaPara)
)
// TODO 1. Read the ad-click data from Kafka
// and convert each record into a case class for easier handling
val clickDStream: DStream[AdClickData] = kafkaDStream.map(
record => {
val data = record.value()
val datas = data.split(" ")
AdClickData( datas(0), datas(1), datas(2), datas(3), datas(4) )
}
)
// TODO 2. Periodically read the user blacklist, filter out clicks made by blacklisted users,
// and aggregate the remaining (legitimate) ad clicks
val aggregateDStream = clickDStream.transform(
rdd => {
// TODO Read the blacklist from MySQL
val driverClass = "com.mysql.jdbc.Driver"
val url = "jdbc:mysql://linux1:3306/spark0317"
val user = "root"
val password = "123123"
Class.forName(driverClass)
val conn = DriverManager.getConnection(url, user, password)
val pstat = conn.prepareStatement(
"""
| select userid from black_list
""".stripMargin
)
val rs : ResultSet = pstat.executeQuery()
val blackids = ListBuffer[String]()
while ( rs.next() ) {
blackids.append(rs.getString(1))
}
rs.close()
pstat.close()
conn.close()
// TODO Filter the collected data, dropping records from blacklisted users
val filterRDD = rdd.filter(
data => {
!blackids.contains(data.userid)
}
)
// TODO Aggregate the filtered data
// (word, count) => ((day, userid, adid), count) => ((day, userid, adid), sum)
val sdf = new SimpleDateFormat("yyyy-MM-dd")
filterRDD.map(
data => {
// TODO The statistics are per day, so convert the timestamp to day granularity
// ts => Date
val date = new java.util.Date(data.ts.toLong)
( ( sdf.format(date), data.userid, data.adid ), 1 )
}
).reduceByKey(_+_)
}
)
// TODO 3. Merge the batch results with the user's historical counts;
// if the total exceeds the threshold, add the user to the blacklist
aggregateDStream.foreachRDD(
rdd => {
rdd.foreach {
case ((day, userid, adid), sum) => {
println(s"处理的数据为 : ( ${day} ${userid} ${adid} ${sum} )")
val driverClass = "com.mysql.jdbc.Driver"
val url = "jdbc:mysql://linux1:3306/spark0317"
val user = "root"
val password = "123123"
Class.forName(driverClass)
val conn = DriverManager.getConnection(url, user, password)
// TODO Get the user's click count for this ad on this day
val pstat = conn.prepareStatement(
"""
|select
| count
|from user_ad_count
|where dt = ? and userid = ? and adid = ?
""".stripMargin)
pstat.setString(1, day)
pstat.setString(2, userid)
pstat.setString(3, adid)
val rs: ResultSet = pstat.executeQuery()
if (rs.next()) {
// A row for this user already exists
val count = rs.getLong(1)
// TODO Add this batch's clicks to the current count
val newCount = count + sum
// TODO If the updated count exceeds the threshold
if ( newCount >= 30 ) {
// TODO Add the user to the blacklist
val pstat1 = conn.prepareStatement(
"""
|insert into black_list (userid) values (?)
""".stripMargin)
pstat1.setString(1, userid)
pstat1.executeUpdate()
pstat1.close()
} else {
// TODO Otherwise just update the count
val pstat2 = conn.prepareStatement(
"""
|update user_ad_count
|set count = ?
|where dt = ? and userid = ? and adid = ?
""".stripMargin)
pstat2.setLong(1, newCount)
pstat2.setString(2, day)
pstat2.setString(3, userid)
pstat2.setString(4, adid)
pstat2.executeUpdate()
pstat2.close()
}
} else {
// No row for this user exists yet
// TODO If this batch's aggregated count already exceeds the threshold
if ( sum >= 30 ) {
// TODO Add the user to the blacklist
// (upsert, so a duplicate entry is simply updated)
val pstat1 = conn.prepareStatement(
"""
|insert into black_list (userid) values (?)
|on duplicate key
|update userid = ?
""".stripMargin)
pstat1.setString(1, userid)
pstat1.setString(2, userid)
pstat1.executeUpdate()
pstat1.close()
} else {
// TODO Insert this user's count into the table
val pstat2 = conn.prepareStatement(
"""
|insert into user_ad_count (dt, userid, adid, count) values (?, ?, ?, ?)
""".stripMargin)
pstat2.setString(1, day)
pstat2.setString(2, userid)
pstat2.setString(3, adid)
pstat2.setLong(4, sum)
pstat2.executeUpdate()
pstat2.close()
}
}
rs.close()
conn.close()
}
}
}
)
ssc.start()
ssc.awaitTermination()
}
case class AdClickData( ts : String, area:String, city:String, userid:String, adid:String )
}
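Version 1.1 above (and the versions that follow) assumes two MySQL tables, black_list and user_ad_count, exist in the spark0317 database. Their definitions are not shown in the original notes; a minimal sketch that would satisfy the SQL used above is given below (column types and key choices are assumptions inferred from the "on duplicate key update" statements).

import java.sql.DriverManager

object CreateBlackListTables {
  def main(args: Array[String]): Unit = {
    // Same connection settings as in the example above.
    Class.forName("com.mysql.jdbc.Driver")
    val conn = DriverManager.getConnection("jdbc:mysql://linux1:3306/spark0317", "root", "123123")
    val stat = conn.createStatement()
    // Blacklisted users; the primary key on userid is what lets
    // "insert ... on duplicate key update" behave as an upsert.
    stat.executeUpdate(
      """
        |create table if not exists black_list (
        |  userid varchar(20) primary key
        |)
      """.stripMargin)
    // Per-day, per-user, per-ad click counts.
    stat.executeUpdate(
      """
        |create table if not exists user_ad_count (
        |  dt     varchar(20),
        |  userid varchar(20),
        |  adid   varchar(20),
        |  count  bigint,
        |  primary key (dt, userid, adid)
        |)
      """.stripMargin)
    stat.close()
    conn.close()
  }
}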
Requirement 1: Ad blacklist
1.2
import java.sql.{DriverManager, ResultSet}
import java.text.SimpleDateFormat
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable.ListBuffer
object Req1_BlackList2 {
def main(args: Array[String]): Unit = {
// TODO Requirement: blacklist any user who clicks the same ad more than 100 times in a day
// TODO Consume data from the specified Kafka topic
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("StreamingBlackList")
val ssc = new StreamingContext(sparkConf, Seconds(3))
// TODO Kafka is typically used for real-time data transport, so within Spark Streaming
// we can read from it with the KafkaUtils helper class
val kafkaPara: Map[String, Object] = Map[String, Object](
ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "linux1:9092,linux2:9092,linux3:9092",
ConsumerConfig.GROUP_ID_CONFIG -> "atguigu0317",
"key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
"value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer"
)
val kafkaDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](Set("atguigu0317"), kafkaPara)
)
// TODO 1. Read the ad-click data from Kafka
// and convert each record into a case class for easier handling
val clickDStream: DStream[AdClickData] = kafkaDStream.map(
record => {
val data = record.value()
val datas = data.split(" ")
AdClickData( datas(0), datas(1), datas(2), datas(3), datas(4) )
}
)
// TODO 2. Periodically read the user blacklist, filter out clicks made by blacklisted users,
// and aggregate the remaining (legitimate) ad clicks
val aggregateDStream = clickDStream.transform(
rdd => {
// TODO Read the blacklist from MySQL
val driverClass = "com.mysql.jdbc.Driver"
val url = "jdbc:mysql://linux1:3306/spark0317"
val user = "root"
val password = "123123"
Class.forName(driverClass)
val conn = DriverManager.getConnection(url, user, password)
val pstat = conn.prepareStatement(
"""
| select userid from black_list
""".stripMargin
)
val rs : ResultSet = pstat.executeQuery()
val blackids = ListBuffer[String]()
while ( rs.next() ) {
blackids.append(rs.getString(1))
}
rs.close()
pstat.close()
conn.close()
// TODO Filter the collected data, dropping records from blacklisted users
val filterRDD = rdd.filter(
data => {
!blackids.contains(data.userid)
}
)
// TODO Aggregate the filtered data
// (word, count) => ((day, userid, adid), count) => ((day, userid, adid), sum)
val sdf = new SimpleDateFormat("yyyy-MM-dd")
filterRDD.map(
data => {
// TODO The statistics are per day, so convert the timestamp to day granularity
// ts => Date
val date = new java.util.Date(data.ts.toLong)
( ( sdf.format(date), data.userid, data.adid ), 1 )
}
).reduceByKey(_+_)
}
)
// TODO 3. Merge the batch results with the user's historical counts;
// if the total exceeds the threshold, add the user to the blacklist
aggregateDStream.foreachRDD(
rdd => {
rdd.foreach {
case ((day, userid, adid), sum) => {
println(s"处理的数据为 : ( ${day} ${userid} ${adid} ${sum} )")
val driverClass = "com.mysql.jdbc.Driver"
val url = "jdbc:mysql://linux1:3306/spark0317"
val user = "root"
val password = "123123"
Class.forName(driverClass)
val conn = DriverManager.getConnection(url, user, password)
// TODO Upsert (insert or update) into the user click-count table
val pstat = conn.prepareStatement(
"""
| insert into user_ad_count( dt, userid, adid, count )
| values (?, ?, ?, ?)
| on duplicate key
| update count = count + ?
""".stripMargin)
// TODO Check whether the threshold is exceeded and, if so, add the user to the blacklist
val pstat1 = conn.prepareStatement(
"""
| select
| userid
| from user_ad_count
| where dt = ? and userid = ? and adid = ? and count >= 30
""".stripMargin)
val pstat2 = conn.prepareStatement(
"""
| insert into black_list(userid)
| values (?)
| on duplicate key
| update userid = ?
""".stripMargin)
// TODO Update the statistics
pstat.setString(1, day)
pstat.setString(2, userid)
pstat.setString(3, adid)
pstat.setLong(4, sum)
pstat.setLong(5, sum)
pstat.executeUpdate()
// TODO Query for counts that exceed the threshold
pstat1.setString(1, day)
pstat1.setString(2, userid)
pstat1.setString(3, adid)
val rs: ResultSet = pstat1.executeQuery()
if ( rs.next() ) {
// TODO Insert into the blacklist
pstat2.setString(1, userid)
pstat2.setString(2, userid)
pstat2.executeUpdate()
}
rs.close()
pstat.close()
pstat1.close()
pstat2.close()
conn.close()
}
}
}
)
ssc.start()
ssc.awaitTermination()
}
case class AdClickData( ts : String, area:String, city:String, userid:String, adid:String )
}
Requirement 1: Ad blacklist
1.3
import java.sql.{DriverManager, ResultSet}
import java.text.SimpleDateFormat
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable.ListBuffer
object Req1_BlackList3 {
def main(args: Array[String]): Unit = {
// TODO Requirement: blacklist any user who clicks the same ad more than 100 times in a day
// TODO Consume data from the specified Kafka topic
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("StreamingBlackList")
val ssc = new StreamingContext(sparkConf, Seconds(3))
// TODO Kafka is typically used for real-time data transport, so within Spark Streaming
// we can read from it with the KafkaUtils helper class
val kafkaPara: Map[String, Object] = Map[String, Object](
ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "linux1:9092,linux2:9092,linux3:9092",
ConsumerConfig.GROUP_ID_CONFIG -> "atguigu0317",
"key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
"value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer"
)
val kafkaDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](Set("atguigu0317"), kafkaPara)
)
// TODO 1. Read the ad-click data from Kafka
// and convert each record into a case class for easier handling
val clickDStream: DStream[AdClickData] = kafkaDStream.map(
record => {
val data = record.value()
val datas = data.split(" ")
AdClickData( datas(0), datas(1), datas(2), datas(3), datas(4) )
}
)
// TODO 2. Periodically read the user blacklist, filter out clicks made by blacklisted users,
// and aggregate the remaining (legitimate) ad clicks
// During processing, the aggregated results have to be written back to MySQL.
// Creating a new database connection for every record hurts performance and can cause errors, so:
// 1. Use a connection pool to optimize connection handling (see the JDBCUtil sketch after this example)
// 2. With very large data volumes, one connection per record can still overwhelm even a pool.
val aggregateDStream = clickDStream.transform(
rdd => {
// TODO Read the blacklist from MySQL
val conn = JDBCUtil.getConnection
val pstat = conn.prepareStatement(
"""
| select userid from black_list
""".stripMargin
)
val rs : ResultSet = pstat.executeQuery()
val blackids = ListBuffer[String]()
while ( rs.next() ) {
blackids.append(rs.getString(1))
}
rs.close()
pstat.close()
conn.close()
// TODO Filter the collected data, dropping records from blacklisted users
val filterRDD = rdd.filter(
data => {
!blackids.contains(data.userid)
}
)
// TODO Aggregate the filtered data
// (word, count) => ((day, userid, adid), count) => ((day, userid, adid), sum)
val sdf = new SimpleDateFormat("yyyy-MM-dd")
filterRDD.map(
data => {
// TODO The statistics are per day, so convert the timestamp to day granularity
// ts => Date
val date = new java.util.Date(data.ts.toLong)
( ( sdf.format(date), data.userid, data.adid ), 1 )
}
).reduceByKey(_+_)
}
)
// TODO 3. Merge the batch results with the user's historical counts;
// if the total exceeds the threshold, add the user to the blacklist
aggregateDStream.foreachRDD(
rdd => {
// TODO Coding
// foreach: any object used inside the operator but created outside it must be serializable,
// and database connection objects cannot be serialized, so the connection is created inside foreach
rdd.foreach {
case ((day, userid, adid), sum) => {
println(s"处理的数据为 : ( ${day} ${userid} ${adid} ${sum} )")
val conn = JDBCUtil.getConnection
// TODO Upsert (insert or update) into the user click-count table
val pstat = conn.prepareStatement(
"""
| insert into user_ad_count( dt, userid, adid, count )
| values (?, ?, ?, ?)
| on duplicate key
| update count = count + ?
""".stripMargin)
// TODO Check whether the threshold is exceeded and, if so, add the user to the blacklist
val pstat1 = conn.prepareStatement(
"""
| select
| userid
| from user_ad_count
| where dt = ? and userid = ? and adid = ? and count >= 30
""".stripMargin)
val pstat2 = conn.prepareStatement(
"""
| insert into black_list(userid)
| values (?)
| on duplicate key
| update userid = ?
""".stripMargin)
// TODO Update the statistics
pstat.setString(1, day)
pstat.setString(2, userid)
pstat.setString(3, adid)
pstat.setLong(4, sum)
pstat.setLong(5, sum)
pstat.executeUpdate()
// TODO Query for counts that exceed the threshold
pstat1.setString(1, day)
pstat1.setString(2, userid)
pstat1.setString(3, adid)
val rs: ResultSet = pstat1.executeQuery()
if ( rs.next() ) {
// TODO Insert into the blacklist
pstat2.setString(1, userid)
pstat2.setString(2, userid)
pstat2.executeUpdate()
}
rs.close()
pstat.close()
pstat1.close()
pstat2.close()
conn.close()
}
}
}
)
ssc.start()
ssc.awaitTermination()
}
case class AdClickData( ts : String, area:String, city:String, userid:String, adid:String )
}
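Versions 1.3 and 1.4 (and Requirement 2 below) call JDBCUtil.getConnection, but the utility class itself is not included in these notes. A minimal sketch based on the Druid connection pool already declared in the POM is shown below; the property values mirror the hard-coded connection settings used earlier, and the pool sizing is an assumption.

import java.sql.Connection
import java.util.Properties
import javax.sql.DataSource
import com.alibaba.druid.pool.DruidDataSourceFactory

object JDBCUtil {
  // Initialize one Druid connection pool per JVM (driver or executor).
  val dataSource: DataSource = init()

  def init(): DataSource = {
    val properties = new Properties()
    properties.setProperty("driverClassName", "com.mysql.jdbc.Driver")
    properties.setProperty("url", "jdbc:mysql://linux1:3306/spark0317?useUnicode=true&characterEncoding=utf8")
    properties.setProperty("username", "root")
    properties.setProperty("password", "123123")
    properties.setProperty("maxActive", "50") // assumed pool size
    DruidDataSourceFactory.createDataSource(properties)
  }

  // Borrow a connection from the pool; calling close() on it returns it to the pool.
  def getConnection: Connection = dataSource.getConnection
}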
Requirement 1: Ad blacklist
1.4
import java.sql.ResultSet
import java.text.SimpleDateFormat
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable.ListBuffer
object Req1_BlackList4 {
def main(args: Array[String]): Unit = {
// TODO Requirement: blacklist any user who clicks the same ad more than 100 times in a day
// TODO Consume data from the specified Kafka topic
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("StreamingBlackList")
val ssc = new StreamingContext(sparkConf, Seconds(3))
// TODO Kafka is typically used for real-time data transport, so within Spark Streaming
// we can read from it with the KafkaUtils helper class
val kafkaPara: Map[String, Object] = Map[String, Object](
ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "linux1:9092,linux2:9092,linux3:9092",
ConsumerConfig.GROUP_ID_CONFIG -> "atguigu0317",
"key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
"value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer"
)
val kafkaDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](Set("atguigu0317"), kafkaPara)
)
// TODO 1. Read the ad-click data from Kafka
// and convert each record into a case class for easier handling
val clickDStream: DStream[AdClickData] = kafkaDStream.map(
record => {
val data = record.value()
val datas = data.split(" ")
AdClickData( datas(0), datas(1), datas(2), datas(3), datas(4) )
}
)
// TODO 2. Periodically read the user blacklist, filter out clicks made by blacklisted users,
// and aggregate the remaining (legitimate) ad clicks
// During processing, the aggregated results have to be written back to MySQL.
// Creating a new database connection for every record hurts performance and can cause errors, so:
// 1. Use a connection pool to optimize connection handling
// 2. With very large data volumes, one connection per record can still overwhelm even a pool.
// Use one connection per partition instead of one connection per record.
val aggregateDStream = clickDStream.transform(
rdd => {
// TODO Read the blacklist from MySQL
val conn = JDBCUtil.getConnection
val pstat = conn.prepareStatement(
"""
| select userid from black_list
""".stripMargin
)
val rs : ResultSet = pstat.executeQuery()
val blackids = ListBuffer[String]()
while ( rs.next() ) {
blackids.append(rs.getString(1))
}
rs.close()
pstat.close()
conn.close()
// TODO Filter the collected data, dropping records from blacklisted users
val filterRDD = rdd.filter(
data => {
!blackids.contains(data.userid)
}
)
// TODO Aggregate the filtered data
// (word, count) => ((day, userid, adid), count) => ((day, userid, adid), sum)
val sdf = new SimpleDateFormat("yyyy-MM-dd")
filterRDD.map(
data => {
// TODO The statistics are per day, so convert the timestamp to day granularity
// ts => Date
val date = new java.util.Date(data.ts.toLong)
( ( sdf.format(date), data.userid, data.adid ), 1 )
}
).reduceByKey(_+_)
}
)
// TODO 3. Merge the batch results with the user's historical counts;
// if the total exceeds the threshold, add the user to the blacklist
aggregateDStream.foreachRDD(
rdd => {
// TODO Coding
// foreach: any object used inside the operator but created outside it must be serializable,
// and database connection objects cannot be serialized.
// The same trade-off as map vs. mapPartitions applies to
// foreach vs. foreachPartition: do the work once per partition instead of once per record.
rdd.foreachPartition(
datas => {
val conn = JDBCUtil.getConnection
// TODO Upsert (insert or update) into the user click-count table
val pstat = conn.prepareStatement(
"""
| insert into user_ad_count( dt, userid, adid, count )
| values (?, ?, ?, ?)
| on duplicate key
| update count = count + ?
""".stripMargin)
// TODO Check whether the threshold is exceeded and, if so, add the user to the blacklist
val pstat1 = conn.prepareStatement(
"""
| select
| userid
| from user_ad_count
| where dt = ? and userid = ? and adid = ? and count >= 30
""".stripMargin)
val pstat2 = conn.prepareStatement(
"""
| insert into black_list(userid)
| values (?)
| on duplicate key
| update userid = ?
""".stripMargin)
// TODO Loop over the records in this partition
datas.foreach{
case ((day, userid, adid), sum) => {
// TODO Update the statistics
pstat.setString(1, day)
pstat.setString(2, userid)
pstat.setString(3, adid)
pstat.setLong(4, sum)
pstat.setLong(5, sum)
pstat.executeUpdate()
// TODO Query for counts that exceed the threshold
pstat1.setString(1, day)
pstat1.setString(2, userid)
pstat1.setString(3, adid)
val rs: ResultSet = pstat1.executeQuery()
if ( rs.next() ) {
// TODO Insert into the blacklist
pstat2.setString(1, userid)
pstat2.setString(2, userid)
pstat2.executeUpdate()
}
}
}
pstat.close()
pstat1.close()
pstat2.close()
conn.close()
}
)
}
)
ssc.start()
ssc.awaitTermination()
}
case class AdClickData( ts : String, area:String, city:String, userid:String, adid:String )
}
Requirement 2: Real-time ad click counts
import java.sql.{Connection, ResultSet}
import java.text.SimpleDateFormat
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable.ListBuffer
object Req2_DateAreaCityAdAnalysis {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("StreamingDateAreaCityAdAnalysis")
val ssc = new StreamingContext(sparkConf, Seconds(3))
val kafkaPara: Map[String, Object] = Map[String, Object](
ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "linux1:9092,linux2:9092,linux3:9092",
ConsumerConfig.GROUP_ID_CONFIG -> "atguigu0317",
"key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
"value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer"
)
val kafkaDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](Set("atguigu0317"), kafkaPara)
)
// TODO 1. Read the ad-click data from Kafka
// and convert each record into a case class for easier handling
val clickDStream: DStream[AdClickData] = kafkaDStream.map(
record => {
val data = record.value()
val datas = data.split(" ")
AdClickData( datas(0), datas(1), datas(2), datas(3), datas(4) )
}
)
val sdf = new SimpleDateFormat("yyyy-MM-dd")
val mapDStream = clickDStream.map(
data => {
val day = sdf.format(new java.util.Date( data.ts.toLong ))
(( day, data.area, data.city, data.adid ), 1)
}
)
// Per-batch aggregation; the running (stateful) total is kept in MySQL via the upsert below
val reduceDS = mapDStream.reduceByKey(_+_)
// TODO Save the aggregated results to MySQL
reduceDS.foreachRDD(
rdd => {
// DStream.map : DStream primitive => method
// rdd.foreach : RDD operator => distributed, runs on the executors
// collection.foreach : plain method => in-memory processing on a single node
rdd.foreachPartition(
datas => {
val conn: Connection = JDBCUtil.getConnection
// Prepared (precompiled) statement
val pstat = conn.prepareStatement(
"""
| insert into area_city_ad_count(dt, area, city, adid, count)
| values ( ?, ?, ?, ?, ? )
| on duplicate key
| update count = count + ?
""".stripMargin)
datas.foreach{
case ((dt, area, city, adid), count) => {
pstat.setString(1, dt)
pstat.setString(2, area)
pstat.setString(3, city)
pstat.setString(4, adid)
pstat.setLong(5, count)
pstat.setLong(6, count)
pstat.executeUpdate()
}
}
pstat.close()
conn.close()
}
)
}
)
ssc.start()
ssc.awaitTermination()
}
case class AdClickData( ts : String, area:String, city:String, userid:String, adid:String )
}
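Requirement 2 writes to an area_city_ad_count table that is also not defined in the original notes. A sketch of a compatible definition is given below, reusing the JDBCUtil sketch from earlier; column types are assumptions, and the composite primary key is what makes "count = count + ?" accumulate on conflict.

import java.sql.Connection

object CreateAreaCityAdCountTable {
  def main(args: Array[String]): Unit = {
    val conn: Connection = JDBCUtil.getConnection
    val stat = conn.createStatement()
    // One row per (day, area, city, ad); the upsert above increments count for existing rows.
    stat.executeUpdate(
      """
        |create table if not exists area_city_ad_count (
        |  dt    varchar(20),
        |  area  varchar(100),
        |  city  varchar(100),
        |  adid  varchar(20),
        |  count bigint,
        |  primary key (dt, area, city, adid)
        |)
      """.stripMargin)
    stat.close()
    conn.close()
  }
}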
Requirement 3: Ad clicks in the last hour
import java.sql.Connection
import java.text.SimpleDateFormat
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Minutes, Seconds, StreamingContext}
object Req3_LastHourAdClickAnalysis {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("StreamingLastHourAdClickAnalysis")
val ssc = new StreamingContext(sparkConf, Seconds(5))
val kafkaPara: Map[String, Object] = Map[String, Object](
ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "linux1:9092,linux2:9092,linux3:9092",
ConsumerConfig.GROUP_ID_CONFIG -> "atguigu0317",
"key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
"value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer"
)
val kafkaDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](Set("atguigu0317"), kafkaPara)
)
// TODO 1. Read the ad-click data from Kafka
// and convert each record into a case class for easier handling
val clickDStream: DStream[AdClickData] = kafkaDStream.map(
record => {
val data = record.value()
val datas = data.split(" ")
AdClickData( datas(0), datas(1), datas(2), datas(3), datas(4) )
}
)
// TODO Restructure the data so it is easy to aggregate
// Window of 1 minute, results computed every 10 seconds
// Round every timestamp down to its 10-second bucket:
// 12:01 => 12:00
// 12:09 => 12:00
// 12:31 => 12:30
// 12:59 => 12:50
// ts => long => 1000  => 00
// ts => long => 9000  => 00
// ts => long => 31000 => 30
// ts => long => 59000 => 50
// long / 10000 * 10000, e.g. 59000 / 10000 * 10000 = 50000
// ((A, 12:01), 1)
// ((A, 12:02), 1)
// ((A, 12:09), 1)
// ==> ( (A, 12:10), 2 )
// ==> ( (A, 12:00), 3 )
// ==> ( (A, 12:20), 1 )
val mapDS = clickDStream.map(
data => {
(( data.adid, data.ts.toLong / 10000 * 10000 ), 1)
}
)
// shuffle: the data is redistributed (shuffled) across partitions before being recombined
val reduceDS = mapDS.reduceByKeyAndWindow(
(x:Int, y:Int) => x + y,
Minutes(1),
Seconds(10)
)
// TODO Option 1: save the results to MySQL as-is after aggregation.
// Rows in a database table have no inherent order,
// so the dashboard would have to sort at query time.
// TODO Option 2: sort the results before saving them to MySQL.
// Restructure the data so it can be grouped and sorted.
val mapDS1 = reduceDS.map{
case ((adid, time), sum) => {
(adid, (time, sum))
}
}
val groupDS: DStream[(String, Iterable[(Long, Int)])] = mapDS1.groupByKey()
val resultDS = groupDS.mapValues(
iter => {
iter.toList.sortWith(
(left, right) => {
left._1 < right._1
}
)
}
)
// When using window computations, consider:
// 1. The range of data per computation (one batch interval vs. several batch intervals)
// 2. How often the computation runs (determined by the slide interval; default is one batch interval)
// 3. Window length and slide interval (both must be integer multiples of the batch interval)
// 4. The relationship between window length and slide interval:
//    large window, small slide => lots of overlapping data, so use incremental computation (see the sketch after this example)
//    large window, large slide => little overlap, each window can be computed independently
// In practice the window here would be one hour.
resultDS.print()
ssc.start()
ssc.awaitTermination()
}
case class AdClickData( ts : String, area:String, city:String, userid:String, adid:String )
}
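As the notes above point out, a one-hour window that slides every 10 seconds recomputes almost the same data on every slide, so the incremental variant of reduceByKeyAndWindow is a better fit. A minimal sketch of that change is shown below as a drop-in replacement for the reduceDS definition inside Req3_LastHourAdClickAnalysis; the checkpoint path "cp" is an assumption.

// Incremental window aggregation: instead of re-reducing the whole window on every slide,
// Spark adds the values of the slice entering the window and subtracts the values of the
// slice leaving it. The inverse-function variant requires a checkpoint directory.
ssc.checkpoint("cp")
val reduceDS = mapDS.reduceByKeyAndWindow(
  (x: Int, y: Int) => x + y,   // values entering the window
  (x: Int, y: Int) => x - y,   // values leaving the window
  Minutes(60),                 // window length: one hour, as suggested in the notes above
  Seconds(10)                  // slide interval: recompute every 10 seconds
)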