1. AbstractFetcherThread
In Kafka, each partition's replicas play one of two roles, Leader or Follower, and Follower replicas periodically pull data from the Leader replica. In the Kafka source code, fetching and processing these messages is implemented by the ReplicaFetcherThread class, which extends AbstractFetcherThread, so we start by analyzing AbstractFetcherThread.
abstract class AbstractFetcherThread(name: String,  // thread name
                                     clientId: String,  // client id, used in log output
                                     val sourceBroker: BrokerEndPoint,  // address of the source broker
                                     fetchBackOffMs: Int = 0,  // back-off interval between fetch retries
                                     isInterruptible: Boolean = true)  // whether the thread may be interrupted
  extends ShutdownableThread(name, isInterruptible)
  {...}
AbstractFetcherThread in turn extends the ShutdownableThread class:
abstract class ShutdownableThread(val name: String, val isInterruptible: Boolean = true)
  extends Thread(name) with Logging {

  def doWork(): Unit

  override def run(): Unit = {
    info("Starting")
    try {
      while (isRunning)
        doWork()
    } catch {
      case e: FatalExitError =>
        shutdownInitiated.countDown()
        shutdownComplete.countDown()
        info("Stopped")
        Exit.exit(e.statusCode())
      case e: Throwable =>
        if (isRunning)
          error("Error due to", e)
    } finally {
      shutdownComplete.countDown()
    }
    info("Stopped")
  }
}
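To make this pattern concrete, here is a minimal, self-contained sketch (SimpleShutdownableThread and PrintingThread are hypothetical names, not Kafka classes): the base class owns the while (isRunning) loop and the shutdown handshake, while a subclass only supplies one bounded unit of work per iteration via doWork.

import java.util.concurrent.CountDownLatch
import java.util.concurrent.atomic.AtomicBoolean

// Base class: keeps calling doWork() until shutdown() is requested.
abstract class SimpleShutdownableThread(name: String) extends Thread(name) {
  private val running = new AtomicBoolean(true)
  private val shutdownComplete = new CountDownLatch(1)

  def doWork(): Unit  // one bounded unit of work per loop iteration

  final def isRunning: Boolean = running.get()

  override def run(): Unit = {
    try {
      while (isRunning) doWork()
    } finally {
      shutdownComplete.countDown()  // signal that the loop has exited
    }
  }

  def shutdown(): Unit = {
    running.set(false)
    shutdownComplete.await()  // block until run() has finished
  }
}

// Example subclass: each doWork() call performs one small piece of work and returns.
class PrintingThread extends SimpleShutdownableThread("printer") {
  override def doWork(): Unit = {
    println("one unit of work")
    Thread.sleep(100)
  }
}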
From ShutdownableThread's run method we can see that the actual work is done in the subclass AbstractFetcherThread's doWork method.
override def doWork() {
  maybeTruncate()  // truncate the replica if needed
  maybeFetch()     // fetch messages
}
doWork does only two things: maybeTruncate handles replica truncation and maybeFetch handles message fetching.
The maybeTruncate method:
private def maybeTruncate(): Unit = {
  // Split the partitions currently in the Truncating state into those with and without a Leader Epoch
  val (partitionsWithEpochs, partitionsWithoutEpochs) = fetchTruncatingPartitions()
  // For partitions with a Leader Epoch, truncate the log to the offset corresponding to that epoch
  if (partitionsWithEpochs.nonEmpty) {
    truncateToEpochEndOffsets(partitionsWithEpochs)
  }
  // For partitions without a Leader Epoch, truncate the log to the high watermark
  if (partitionsWithoutEpochs.nonEmpty) {
    truncateToHighWatermark(partitionsWithoutEpochs)
  }
}
Now look at the implementation of truncateToHighWatermark. It calls truncate, which is an abstract method whose concrete implementation lives in ReplicaFetcherThread.
private[server] def truncateToHighWatermark(partitions: Set[TopicPartition]): Unit = inLock(partitionMapLock) {
  val fetchOffsets = mutable.HashMap.empty[TopicPartition, OffsetTruncationState]
  val partitionsWithError = mutable.HashSet.empty[TopicPartition]
  // Iterate over every partition to be truncated
  for (tp <- partitions) {
    // Get the partition's fetch state
    val partitionState = partitionStates.stateValue(tp)
    if (partitionState != null) {
      try {
        // Take the high watermark; a partition's highest fetchable offset is its high watermark
        val highWatermark = partitionState.fetchOffset
        val truncationState = OffsetTruncationState(highWatermark, truncationCompleted = true)
        info(s"Truncating partition $tp to local high watermark $highWatermark")
        // Truncate to the high watermark; the concrete implementation is in ReplicaFetcherThread
        truncate(tp, truncationState)
        fetchOffsets.put(tp, truncationState)
      } catch {
        case e: KafkaStorageException =>
          info(s"Failed to truncate $tp", e)
          partitionsWithError += tp
      }
    }
  }
  handlePartitionsWithErrors(partitionsWithError)
  // Update the fetch state of this group of partitions
  updateFetchOffsetAndMaybeMarkTruncationComplete(fetchOffsets)
}
maybeFetch calls buildFetch to construct the FetchRequest; buildFetch is abstract and its concrete implementation is in ReplicaFetcherThread.
private def maybeFetch(): Unit = {
  val (fetchStates, fetchRequestOpt) = inLock(partitionMapLock) {
    // partitionStates holds the partitions to fetch messages for, along with their fetch state
    val fetchStates = partitionStates.partitionStateMap.asScala
    // Build a FetchRequest for the partitions in partitionStates; the concrete implementation is in ReplicaFetcherThread
    val ResultWithPartitions(fetchRequestOpt, partitionsWithError) = buildFetch(fetchStates)
    // Handle the error partitions, mainly by moving them to the end of the ordered map
    handlePartitionsWithErrors(partitionsWithError)
    // If there are no fetchable partitions at the moment, wait fetchBackOffMs before retrying
    if (fetchRequestOpt.isEmpty) {
      trace(s"There are no active partitions. Back off for $fetchBackOffMs ms before sending a fetch request")
      partitionMapCond.await(fetchBackOffMs, TimeUnit.MILLISECONDS)
    }
    (fetchStates, fetchRequestOpt)
  }
  fetchRequestOpt.foreach { fetchRequest =>
    // Send the FETCH request to the leader replica and process the response
    processFetchRequest(fetchStates, fetchRequest)
  }
}
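The comment in maybeFetch about appending error partitions to the end of the ordered map refers to a simple fairness scheme: partitions are fetched in insertion order, and a partition that was just processed or that hit an error is re-appended at the tail so the others get their turn first (in processFetchRequest below, partitionStates.updateAndMoveToEnd plays the same role after a successful append). The sketch below uses a hypothetical OrderedStates class, not Kafka's actual PartitionStates, to illustrate the idea.

import scala.collection.mutable

// Hypothetical illustration of the "update and move to end" behavior.
class OrderedStates[K, V] {
  // LinkedHashMap iterates in insertion order.
  private val states = mutable.LinkedHashMap.empty[K, V]

  def set(k: K, v: V): Unit = {
    states.put(k, v)
  }

  // Removing and re-inserting the key pushes it to the tail of the iteration order.
  def updateAndMoveToEnd(k: K, v: V): Unit = {
    states.remove(k)
    states.put(k, v)
  }

  // Entries at the head are the ones that have waited longest to be fetched.
  def ordered: Seq[(K, V)] = states.toSeq
}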
processFetchRequest calls processPartitionData to handle the response; processPartitionData is abstract and its concrete implementation is in ReplicaFetcherThread.
private def processFetchRequest(fetchStates: Map[TopicPartition, PartitionFetchState],
                                fetchRequest: FetchRequest.Builder): Unit = {
  val partitionsWithError = mutable.Set[TopicPartition]()
  var responseData: Seq[(TopicPartition, FetchData)] = Seq.empty
  try {
    trace(s"Sending fetch request $fetchRequest")
    // Fetch data from the leader
    responseData = fetchFromLeader(fetchRequest)
  } catch {
    case t: Throwable =>
      if (isRunning) {
        warn(s"Error in response for fetch request $fetchRequest", t)
        inLock(partitionMapLock) {
          partitionsWithError ++= partitionStates.partitionSet.asScala
          partitionMapCond.await(fetchBackOffMs, TimeUnit.MILLISECONDS)
        }
      }
  }
  // Update the request rate metric
  fetcherStats.requestRate.mark()
  if (responseData.nonEmpty) {
    // process fetched data
    inLock(partitionMapLock) {
      responseData.foreach { case (topicPartition, partitionData) =>
        Option(partitionStates.stateValue(topicPartition)).foreach { currentFetchState =>
          val fetchState = fetchStates(topicPartition)
          // The response is processed only when:
          // 1. the requested offset equals the previously saved next offset to fetch, and
          // 2. the partition is currently in a fetchable state
          if (fetchState.fetchOffset == currentFetchState.fetchOffset && currentFetchState.isReadyForFetch) {
            partitionData.error match {
              // No error
              case Errors.NONE =>
                try {
                  // Process the response data; the concrete implementation is in ReplicaFetcherThread
                  val logAppendInfoOpt = processPartitionData(topicPartition, currentFetchState.fetchOffset,
                    partitionData)
                  logAppendInfoOpt.foreach { logAppendInfo =>
                    val validBytes = logAppendInfo.validBytes
                    val nextOffset = if (validBytes > 0) logAppendInfo.lastOffset + 1 else currentFetchState.fetchOffset
                    fetcherLagStats.getAndMaybePut(topicPartition).lag = Math.max(0L, partitionData.highWatermark - nextOffset)
                    if (validBytes > 0 && partitionStates.contains(topicPartition)) {
                      val newFetchState = PartitionFetchState(nextOffset, fetchState.currentLeaderEpoch,
                        state = Fetching)
                      partitionStates.updateAndMoveToEnd(topicPartition, newFetchState)
                      fetcherStats.byteRate.mark(validBytes)
                    }
                  }
                } catch {
                  case ime: CorruptRecordException =>
                    error(s"Found invalid messages during fetch for partition $topicPartition " +
                      s"offset ${currentFetchState.fetchOffset}", ime)
                    partitionsWithError += topicPartition
                  case e: KafkaStorageException =>
                    error(s"Error while processing data for partition $topicPartition", e)
                    partitionsWithError += topicPartition
                  case e: Throwable =>
                    throw new KafkaException(s"Error processing data for partition $topicPartition " +
                      s"offset ${currentFetchState.fetchOffset}", e)
                }
              // The fetch offset is out of range, usually because the leader has changed
              case Errors.OFFSET_OUT_OF_RANGE =>
                if (!handleOutOfRangeError(topicPartition, currentFetchState))
                  // Add to the error partition list
                  partitionsWithError += topicPartition
              // The replica's leader epoch is newer than the epoch on the leader's broker
              case Errors.UNKNOWN_LEADER_EPOCH =>
                debug(s"Remote broker has a smaller leader epoch for partition $topicPartition than " +
                  s"this replica's current leader epoch of ${fetchState.currentLeaderEpoch}.")
                // Add to the error partition list
                partitionsWithError += topicPartition
              // The replica's leader epoch is older than the epoch on the leader's broker
              case Errors.FENCED_LEADER_EPOCH =>
                onPartitionFenced(topicPartition)
              // The leader has changed
              case Errors.NOT_LEADER_FOR_PARTITION =>
                debug(s"Remote broker is not the leader for partition $topicPartition, which could indicate " +
                  "that the partition is being moved")
                // Add to the error partition list
                partitionsWithError += topicPartition
              case _ =>
                error(s"Error for partition $topicPartition at offset ${currentFetchState.fetchOffset}",
                  partitionData.error.exception)
                partitionsWithError += topicPartition
            }
          }
        }
      }
    }
  }
  if (partitionsWithError.nonEmpty) {
    debug(s"Handling errors for partitions $partitionsWithError")
    // Handle the error partition list
    handlePartitionsWithErrors(partitionsWithError)
  }
}
2. ReplicaFetcherThread
First, let's look at the ReplicaFetcherThread class definition.
class ReplicaFetcherThread(name: String,
                           fetcherId: Int,
                           sourceBroker: BrokerEndPoint,
                           brokerConfig: KafkaConfig,
                           replicaMgr: ReplicaManager,
                           metrics: Metrics,
                           time: Time,
                           quota: ReplicaQuota,
                           leaderEndpointBlockingSend: Option[BlockingSend] = None)
  extends AbstractFetcherThread(name = name,
                                clientId = name,
                                sourceBroker = sourceBroker,
                                fetchBackOffMs = brokerConfig.replicaFetchBackoffMs,
                                isInterruptible = false) {
  ...
  // Maximum time the leader will wait before responding to the follower's fetch request
  private val maxWait = brokerConfig.replicaFetchWaitMaxMs
  // Minimum number of bytes that must accumulate before a fetch response is returned
  private val minBytes = brokerConfig.replicaFetchMinBytes
  // Maximum number of bytes in a single valid fetch response
  private val maxBytes = brokerConfig.replicaFetchResponseMaxBytes
  // Maximum number of bytes fetched for a single partition
  private val fetchSize = brokerConfig.replicaFetchMaxBytes
  ...
}
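For reference, the sketch below maps these fields to the broker-side property names that back them; the values shown are illustrative and may not match the defaults of your Kafka version.

import java.util.Properties

object ReplicaFetchConfigExample {
  // Broker properties that feed the fields above (values are illustrative).
  val props = new Properties()
  props.put("replica.fetch.wait.max.ms", "500")              // maxWait
  props.put("replica.fetch.min.bytes", "1")                  // minBytes
  props.put("replica.fetch.response.max.bytes", "10485760")  // maxBytes
  props.put("replica.fetch.max.bytes", "1048576")            // fetchSize
  props.put("replica.fetch.backoff.ms", "1000")              // fetchBackOffMs
}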
Next, let's see how ReplicaFetcherThread implements its three key methods: processPartitionData, buildFetch, and truncate.
1. The processPartitionData method
override def processPartitionData(topicPartition: TopicPartition,
                                  fetchOffset: Long,
                                  partitionData: FetchData): Option[LogAppendInfo] = {
  val replica = replicaMgr.localReplicaOrException(topicPartition)
  // Get the Partition object for this topic partition from the replica manager
  val partition = replicaMgr.getPartition(topicPartition).get
  // Convert the fetched data into a record set in the expected format
  val records = toMemoryRecords(partitionData.records)
  maybeWarnIfOversizedRecords(records, topicPartition)
  // If the fetch offset is not the local log end offset (LEO), treat it as an illegal state
  if (fetchOffset != replica.logEndOffset)
    throw new IllegalStateException("Offset mismatch for partition %s: fetched offset = %d, log end offset = %d.".format(
      topicPartition, fetchOffset, replica.logEndOffset))
  if (isTraceEnabled)
    trace("Follower has replica log end offset %d for partition %s. Received %d messages and leader hw %d"
      .format(replica.logEndOffset, topicPartition, records.sizeInBytes, partitionData.highWatermark))
  // Append the records to the follower replica's local log
  val logAppendInfo = partition.appendRecordsToFollowerOrFutureReplica(records, isFuture = false)
  if (isTraceEnabled)
    trace("Follower has replica log end offset %d after appending %d bytes of messages for partition %s"
      .format(replica.logEndOffset, records.sizeInBytes, topicPartition))
  val followerHighWatermark = replica.logEndOffset.min(partitionData.highWatermark)
  val leaderLogStartOffset = partitionData.logStartOffset
  // Update the follower replica's high watermark
  replica.highWatermark = new LogOffsetMetadata(followerHighWatermark)
  // Try to advance the follower replica's log start offset
  replica.maybeIncrementLogStartOffset(leaderLogStartOffset)
  if (isTraceEnabled)
    trace(s"Follower set replica high watermark for partition $topicPartition to $followerHighWatermark")
  // Replica fetch throttling
  if (quota.isThrottled(topicPartition))
    quota.record(records.sizeInBytes)
  replicaMgr.brokerTopicStats.updateReplicationBytesIn(records.sizeInBytes)
  logAppendInfo
}
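A quick worked example (hypothetical numbers) of the high-watermark update above: the follower can never expose offsets it has not yet replicated, so its high watermark is the smaller of its own LEO and the leader's reported high watermark.

// Hypothetical values illustrating followerHighWatermark = min(follower LEO, leader HW);
// paste into a Scala REPL to check.
val followerLogEndOffset  = 120L  // follower LEO after appending the fetched batch
val leaderHighWatermark   = 150L  // high watermark reported by the leader in the fetch response
val followerHighWatermark = math.min(followerLogEndOffset, leaderHighWatermark)  // = 120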
2. The buildFetch method
override def buildFetch(partitionMap: Map[TopicPartition, PartitionFetchState]): ResultWithPartitions[Option[FetchRequest.Builder]] = {
  val partitionsWithError = mutable.Set[TopicPartition]()
  val builder = fetchSessionHandler.newBuilder()
  // Iterate over every partition; partitions in a fetchable state are added to the builder for later processing,
  // while partitions that hit an error are added to the error partition list
  partitionMap.foreach { case (topicPartition, fetchState) =>
    if (fetchState.isReadyForFetch && !shouldFollowerThrottle(quota, topicPartition)) {
      try {
        val logStartOffset = replicaMgr.localReplicaOrException(topicPartition).logStartOffset
        builder.add(topicPartition, new FetchRequest.PartitionData(
          fetchState.fetchOffset, logStartOffset, fetchSize, Optional.of(fetchState.currentLeaderEpoch)))
      } catch {
        case _: KafkaStorageException =>
          // The replica has already been marked offline due to log directory failure and the original failure should have already been logged.
          // This partition should be removed from ReplicaFetcherThread soon by ReplicaManager.handleLogDirFailure()
          partitionsWithError += topicPartition
      }
    }
  }
  val fetchData = builder.build()
  val fetchRequestOpt = if (fetchData.sessionPartitions.isEmpty && fetchData.toForget.isEmpty) {
    None
  } else {
    // Construct the Builder for the FETCH request
    val requestBuilder = FetchRequest.Builder
      .forReplica(fetchRequestVersion, replicaId, maxWait, minBytes, fetchData.toSend)
      .setMaxBytes(maxBytes)
      .toForget(fetchData.toForget)
      .metadata(fetchData.metadata)
    Some(requestBuilder)
  }
  // Return the Builder along with the error partition list
  ResultWithPartitions(fetchRequestOpt, partitionsWithError)
}
3. The truncate method
override def truncate(tp: TopicPartition, offsetTruncationState: OffsetTruncationState): Unit = {
  val replica = replicaMgr.localReplicaOrException(tp)
  // Get the Partition object
  val partition = replicaMgr.getPartition(tp).get
  // Perform the truncation; the target position is offsetTruncationState.offset
  partition.truncateTo(offsetTruncationState.offset, isFuture = false)
  if (offsetTruncationState.offset < replica.highWatermark.messageOffset)
    warn(s"Truncating $tp to offset ${offsetTruncationState.offset} below high watermark " +
      s"${replica.highWatermark.messageOffset}")
  // mark the future replica for truncation only when we do last truncation
  if (offsetTruncationState.truncationCompleted)
    replicaMgr.replicaAlterLogDirsManager.markPartitionsForTruncation(brokerConfig.brokerId, tp,
      offsetTruncationState.offset)
}