Spark BlockManager, DiskStore, MemoryStore, BlockInfoManager, BlockInfo Source Code Walkthrough
class BlockInfo & object BlockInfo
This class holds the read/write-lock bookkeeping for a single block, so it carries the block's StorageLevel, readerCount, writerTask, and size (the block size in bytes).
The companion object BlockInfo mainly defines a couple of constants:
val NO_WRITER: Long = -1
val NON_TASK_WRITER: Long = -1024
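Paraphrased, the companion object is essentially just these two markers for the write-lock holder:
private[storage] object BlockInfo {
  // no task currently holds the write lock
  val NO_WRITER: Long = -1
  // the write lock is held by non-task code (e.g. a driver thread or test code)
  val NON_TASK_WRITER: Long = -1024
}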
Now let's look at the source of class BlockInfo:
// Per-block read/write lock bookkeeping
private[storage] class BlockInfo(
val level: StorageLevel,
val classTag: ClassTag[_],
val tellMaster: Boolean) {
/**
* The size of the block (in bytes)
*/
def size: Long = _size
def size_=(s: Long): Unit = {
_size = s
checkInvariants()
}
private[this] var _size: Long = 0 // block 的大小 byte单位
/**
* The number of times that this block has been locked for reading.
*/
def readerCount: Int = _readerCount
def readerCount_=(c: Int): Unit = {
_readerCount = c
checkInvariants()
}
private[this] var _readerCount: Int = 0 // block 的 锁读次数 表示有多少个task正在读这个 block
/**
* The task attempt id of the task which currently holds the write lock for this block, or
* [[BlockInfo.NON_TASK_WRITER]] if the write lock is held by non-task code, or
* [[BlockInfo.NO_WRITER]] if this block is not locked for writing.
*/
def writerTask: Long = _writerTask
def writerTask_=(t: Long): Unit = {
_writerTask = t
checkInvariants()
}
private[this] var _writerTask: Long = BlockInfo.NO_WRITER // the task attempt id currently holding the write lock; the initial value BlockInfo.NO_WRITER (-1) means no task is writing this block
private def checkInvariants(): Unit = {
// A block's reader count must be non-negative:
assert(_readerCount >= 0)
// A block is either locked for reading or for writing, but not for both at the same time:
assert(_readerCount == 0 || _writerTask == BlockInfo.NO_WRITER)
}
checkInvariants()
}
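The two assertions in checkInvariants() mean a BlockInfo is always either in a readers-only state or held by a single writer, never both. A minimal sketch of the legal transitions (BlockInfo is private[storage], so this would have to live in package org.apache.spark.storage; the task attempt id 17 is made up):
import scala.reflect.ClassTag

val info = new BlockInfo(StorageLevel.MEMORY_ONLY, ClassTag.Any, tellMaster = true)
info.readerCount += 1       // first reader
info.readerCount += 1       // read locks are shared, so a second reader is fine
info.readerCount -= 2       // both readers release
info.writerTask = 17L       // only now may a task attempt take the write lock
// info.readerCount += 1   // would trip the assertion: never locked for reading and writing at once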
class BlockInfoManager
BlockInfo models the read/write lock of a single block, while BlockInfoManager manages the locks of all blocks on this node. Because one task may read and write several blocks at once, this class keeps one-to-many task-to-block mappings in writeLocksByTask and readLocksByTask. Let's look at the source:
// BlockInfoManager manages the per-block locks
private[storage] class BlockInfoManager extends Logging {
private type TaskAttemptId = Long
/**
* Used to look up metadata for individual blocks. Entries are added to this map via an atomic
* set-if-not-exists operation ([[lockNewBlockForWriting()]]) and are removed
* by [[removeBlock()]].
*/
@GuardedBy("this") //BlockId 和 BlockInfo 的key-value map
private[this] val infos = new mutable.HashMap[BlockId, BlockInfo]
/**
* Tracks the set of blocks that each task has locked for writing.
*/
@GuardedBy("this") // taskID 与 BlockId 的 key-value map
private[this] val writeLocksByTask =
new mutable.HashMap[TaskAttemptId, mutable.Set[BlockId]]
with mutable.MultiMap[TaskAttemptId, BlockId]
/**
* Tracks the set of blocks that each task has locked for reading, along with the number of times
* that a block has been locked (since our read locks are re-entrant).
*/
@GuardedBy("this") // taskID 与 set BlockId 的 key-value map
private[this] val readLocksByTask =
new mutable.HashMap[TaskAttemptId, ConcurrentHashMultiset[BlockId]]
// ----------------------------------------------------------------------------------------------
// Initialization for special task attempt ids:
registerTask(BlockInfo.NON_TASK_WRITER) // BlockInfo.NON_TASK_WRITER -1024 特殊的标识
// ----------------------------------------------------------------------------------------------
/**
* Called at the start of a task in order to register that task with this [[BlockInfoManager]].
* This must be called prior to calling any other BlockInfoManager methods from that task.
*/
// 注册 taskID readLockByTask
def registerTask(taskAttemptId: TaskAttemptId): Unit = synchronized {
require(!readLocksByTask.contains(taskAttemptId),
s"Task attempt $taskAttemptId is already registered") //判断 mutable.HashMap[TaskAttemptId, ConcurrentHashMultiset[BlockId]] 中是否存在这个 taskID
readLocksByTask(taskAttemptId) = ConcurrentHashMultiset.create() // put 这个 task 进去
}
/**
* Returns the current task's task attempt id (which uniquely identifies the task), or
* [[BlockInfo.NON_TASK_WRITER]] if called by a non-task thread.
*/
// Returns the TaskAttemptId of the task attempt currently running in this thread's TaskContext; if there is no TaskContext (the caller is not a task thread), returns BlockInfo.NON_TASK_WRITER
private def currentTaskAttemptId: TaskAttemptId = {
Option(TaskContext.get()).map(_.taskAttemptId()).getOrElse(BlockInfo.NON_TASK_WRITER)
}
/**
* Lock a block for reading and return its metadata.
*
* If another task has already locked this block for reading, then the read lock will be
* immediately granted to the calling task and its lock count will be incremented.
*
* If another task has locked this block for writing, then this call will block until the write
* lock is released or will return immediately if `blocking = false`.
*
* A single task can lock a block multiple times for reading, in which case each lock will need
* to be released separately.
*
* @param blockId the block to lock.
* @param blocking if true (default), this call will block until the lock is acquired. If false,
* this call will return immediately if the lock acquisition fails.
* @return None if the block did not exist or was removed (in which case no lock is held), or
* Some(BlockInfo) (in which case the block is locked for reading).
*/
// 锁读
def lockForReading(
blockId: BlockId,
blocking: Boolean = true): Option[BlockInfo] = synchronized { //对象锁
logTrace(s"Task $currentTaskAttemptId trying to acquire read lock for $blockId")
do {
infos.get(blockId) match { // infos : BlockId 和 BlockInfo 的key-value map
case None => return None // 没有这个 BlockId 的直接返回 None
case Some(info) =>
if (info.writerTask == BlockInfo.NO_WRITER) { //如果 infos 存在这个 BlockId,且没有 写数据 锁的占用
info.readerCount += 1 // block 的 锁读次数 +1
readLocksByTask(currentTaskAttemptId).add(blockId) //readLocksByTask : taskID 与 set BlockId 的 key-value map ;readLocksByTask(currentTaskAttemptId) add 这个blockId
logTrace(s"Task $currentTaskAttemptId acquired read lock for $blockId")
return Some(info) //返回这个 BlockId
}
}
if (blocking) { // the block exists but its write lock is currently held, so wait here until unlock()/removeBlock() calls notifyAll()
wait()
}
} while (blocking)
None
}
/**
* Lock a block for writing and return its metadata.
*
* If another task has already locked this block for either reading or writing, then this call
* will block until the other locks are released or will return immediately if `blocking = false`.
*
* @param blockId the block to lock.
* @param blocking if true (default), this call will block until the lock is acquired. If false,
* this call will return immediately if the lock acquisition fails.
* @return None if the block did not exist or was removed (in which case no lock is held), or
* Some(BlockInfo) (in which case the block is locked for writing).
*/
// 锁写
def lockForWriting(
blockId: BlockId,
blocking: Boolean = true): Option[BlockInfo] = synchronized {
logTrace(s"Task $currentTaskAttemptId trying to acquire write lock for $blockId")
do {
infos.get(blockId) match { // infos : BlockId 和 BlockInfo 的key-value map
case None => return None // 没有这个 BlockId 的直接返回 None
case Some(info) => //这个 block 第一次 写的时候 需要手动在 infos 中先 put 这个 blockId, 可以看 BlockMaanger 的 lockNewBlockForWriting 实例
if (info.writerTask == BlockInfo.NO_WRITER && info.readerCount == 0) { //如果 infos 存在这个 BlockId,且没有 写数据, 并且没有 读的任务
info.writerTask = currentTaskAttemptId //更新 写锁标志
writeLocksByTask.addBinding(currentTaskAttemptId, blockId) //更新 writeLocksByTask 返回信息
logTrace(s"Task $currentTaskAttemptId acquired write lock for $blockId")
return Some(info)
}
}
if (blocking) { // 其他情况 则会等待其他的方法 激活
wait()
}
} while (blocking)
None
}
/**
* Throws an exception if the current task does not hold a write lock on the given block.
* Otherwise, returns the block's BlockInfo.
*/
// 当前 task 持有这个 写锁,否则 抛出异常;其他情况 返回这个 block 的BlockInfo
def assertBlockIsLockedForWriting(blockId: BlockId): BlockInfo = synchronized {
infos.get(blockId) match {
case Some(info) =>
if (info.writerTask != currentTaskAttemptId) {
throw new SparkException(
s"Task $currentTaskAttemptId has not locked block $blockId for writing")
} else {
info
}
case None =>
throw new SparkException(s"Block $blockId does not exist")
}
}
/**
* Get a block's metadata without acquiring any locks. This method is only exposed for use by
* [[BlockManager.getStatus()]] and should not be called by other code outside of this class.
*/
//直接 获取这个 blockid 的 BlockInfo
private[storage] def get(blockId: BlockId): Option[BlockInfo] = synchronized {
infos.get(blockId)
}
/**
* Downgrades an exclusive write lock to a shared read lock.
*/
//锁降级 锁降级指当前线程把持住写锁再获取到读锁,随后释放先前拥有的写锁的过程
def downgradeLock(blockId: BlockId): Unit = synchronized { //执行这个 锁降级的前提是,有写锁锁定
logTrace(s"Task $currentTaskAttemptId downgrading write lock for $blockId")
val info: BlockInfo = get(blockId).get
require(info.writerTask == currentTaskAttemptId,
s"Task $currentTaskAttemptId tried to downgrade a write lock that it does not hold on" +
s" block $blockId")
unlock(blockId)
val lockOutcome = lockForReading(blockId, blocking = false)
assert(lockOutcome.isDefined)
}
/**
* Release a lock on the given block.
* In case a TaskContext is not propagated properly to all child threads for the task, we fail to
* get the TID from TaskContext, so we have to explicitly pass the TID value to release the lock.
*
* See SPARK-18406 for more discussion of this issue.
*/
//释放锁
def unlock(blockId: BlockId, taskAttemptId: Option[TaskAttemptId] = None): Unit = synchronized {
val taskId = taskAttemptId.getOrElse(currentTaskAttemptId)
logTrace(s"Task $taskId releasing lock for $blockId")
val info: BlockInfo = get(blockId).getOrElse { //获取blockInfo 信息
throw new IllegalStateException(s"Block $blockId not found")
}
if (info.writerTask != BlockInfo.NO_WRITER) { // 如果有 写锁占有
info.writerTask = BlockInfo.NO_WRITER //更新为 不占有 写锁
writeLocksByTask.removeBinding(taskId, blockId) //移除 writeLocksByTask 中的信息
} else { // no write lock is held, so a read lock must be the one being released
assert(info.readerCount > 0, s"Block $blockId is not locked for reading") //有读锁占用
info.readerCount -= 1 //读锁减 1
val countsForTask: ConcurrentHashMultiset[BlockId] = readLocksByTask(taskId) //获取 这个 taskID 的所有的 BlockId
val newPinCountForTask: Int = countsForTask.remove(blockId, 1) - 1
assert(newPinCountForTask >= 0,
s"Task $taskId release lock on block $blockId more times than it acquired it")
}
notifyAll() // 唤醒 其他的 wait
}
/**
* Attempt to acquire the appropriate lock for writing a new block.
*
* This enforces the first-writer-wins semantics. If we are the first to write the block,
* then just go ahead and acquire the write lock. Otherwise, if another thread is already
* writing the block, then we wait for the write to finish before acquiring the read lock.
*
* @return true if the block did not already exist, false otherwise. If this returns false, then
* a read lock on the existing block will be held. If this returns true, a write lock on
* the new block will be held.
*/
// 尝试新的block 写锁
def lockNewBlockForWriting(
blockId: BlockId,
newBlockInfo: BlockInfo): Boolean = synchronized {
logTrace(s"Task $currentTaskAttemptId trying to put $blockId")
lockForReading(blockId) match {
case Some(info) => //如果有这个blockId 的 读锁存在,则返回false
// Block already exists. This could happen if another thread races with us to compute
// the same block. In this case, just keep the read lock and return.
false
case None =>
// Block does not yet exist or is removed, so we are free to acquire the write lock
infos(blockId) = newBlockInfo // the block does not exist yet, so register its BlockInfo in infos first
lockForWriting(blockId) // then take the write lock; in this situation it is acquired immediately
true
}
}
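// Illustration of the first-writer-wins semantics documented above: if task A and task B race to
// compute the same block, A's lockNewBlockForWriting() returns true and A holds the write lock
// while it computes and stores the block. B's call parks inside lockForReading() until A releases
// (or downgrades) the write lock, then returns false with a read lock held, so B can simply reuse
// A's result instead of recomputing it.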
/**
* Release all lock held by the given task, clearing that task's pin bookkeeping
* structures and updating the global pin counts. This method should be called at the
* end of a task (either by a task completion handler or in `TaskRunner.run()`).
*
* @return the ids of blocks whose pins were released
*/
// 释放这个 task 的所有的 BlockId
def releaseAllLocksForTask(taskAttemptId: TaskAttemptId): Seq[BlockId] = synchronized {
val blocksWithReleasedLocks = mutable.ArrayBuffer[BlockId]()
val readLocks = readLocksByTask.remove(taskAttemptId).getOrElse(ImmutableMultiset.of[BlockId]())
val writeLocks = writeLocksByTask.remove(taskAttemptId).getOrElse(Seq.empty)
for (blockId <- writeLocks) { //先 更新 写锁
infos.get(blockId).foreach { info =>
assert(info.writerTask == taskAttemptId)
info.writerTask = BlockInfo.NO_WRITER
}
blocksWithReleasedLocks += blockId
}
readLocks.entrySet().iterator().asScala.foreach { entry =>
val blockId = entry.getElement
val lockCount = entry.getCount
blocksWithReleasedLocks += blockId
get(blockId).foreach { info =>
info.readerCount -= lockCount
assert(info.readerCount >= 0)
}
}
notifyAll()
blocksWithReleasedLocks
}
/** Returns the number of locks held by the given task. Used only for testing. */
private[storage] def getTaskLockCount(taskAttemptId: TaskAttemptId): Int = {
readLocksByTask.get(taskAttemptId).map(_.size()).getOrElse(0) +
writeLocksByTask.get(taskAttemptId).map(_.size).getOrElse(0)
}
/**
* Returns the number of blocks tracked.
*/
// infos set 的长度
def size: Int = synchronized {
infos.size // info : mutable.HashMap[BlockId, BlockInfo]
}
/**
* Return the number of map entries in this pin counter's internal data structures.
* This is used in unit tests in order to detect memory leaks.
*/
private[storage] def getNumberOfMapEntries: Long = synchronized {
size +
readLocksByTask.size +
readLocksByTask.map(_._2.size()).sum +
writeLocksByTask.size +
writeLocksByTask.map(_._2.size).sum
}
/**
* Returns an iterator over a snapshot of all blocks' metadata. Note that the individual entries
* in this iterator are mutable and thus may reflect blocks that are deleted while the iterator
* is being traversed.
*/
// infos 转 元组 array
def entries: Iterator[(BlockId, BlockInfo)] = synchronized {
infos.toArray.toIterator
}
/**
* Removes the given block and releases the write lock on it.
*
* This can only be called while holding a write lock on the given block.
*/
def removeBlock(blockId: BlockId): Unit = synchronized {
logTrace(s"Task $currentTaskAttemptId trying to remove block $blockId")
infos.get(blockId) match {
case Some(blockInfo) =>
if (blockInfo.writerTask != currentTaskAttemptId) {
throw new IllegalStateException(
s"Task $currentTaskAttemptId called remove() on block $blockId without a write lock")
} else {
infos.remove(blockId)
blockInfo.readerCount = 0
blockInfo.writerTask = BlockInfo.NO_WRITER
writeLocksByTask.removeBinding(currentTaskAttemptId, blockId)
}
case None =>
throw new IllegalArgumentException(
s"Task $currentTaskAttemptId called remove() on non-existent block $blockId")
}
notifyAll()
}
/**
* Delete all state. Called during shutdown.
*/
def clear(): Unit = synchronized {
infos.valuesIterator.foreach { blockInfo =>
blockInfo.readerCount = 0
blockInfo.writerTask = BlockInfo.NO_WRITER
}
infos.clear()
readLocksByTask.clear()
writeLocksByTask.clear()
notifyAll()
}
}
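To tie the lock API together, here is a minimal driver-/test-style sketch. It assumes package org.apache.spark.storage (these classes are package-private); with no TaskContext in scope every call runs as BlockInfo.NON_TASK_WRITER, which the constructor pre-registered above, and TestBlockId is the BlockId subtype Spark uses in tests:
import scala.reflect.ClassTag

val manager = new BlockInfoManager
val blockId = TestBlockId("demo")
val info = new BlockInfo(StorageLevel.MEMORY_ONLY, ClassTag.Any, tellMaster = false)

assert(manager.lockNewBlockForWriting(blockId, info)) // new block: write lock acquired, returns true
manager.downgradeLock(blockId)                        // write lock -> read lock
manager.unlock(blockId)                               // release the read lock
assert(manager.lockForWriting(blockId).isDefined)     // take the write lock again
manager.removeBlock(blockId)                          // drop the entry and release the write lock
assert(manager.size == 0)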
class MemoryStore & class DiskStore
These two classes are the main implementations for caching block data in storage memory and on disk. MemoryStore additionally has to cooperate with the node's MemoryManager so that caching cannot drive the executor into an out-of-memory failure.
Below is the MemoryStore source:
// Stores blocks in memory
private[spark] class MemoryStore(
conf: SparkConf,
blockInfoManager: BlockInfoManager,
serializerManager: SerializerManager,
memoryManager: MemoryManager,
blockEvictionHandler: BlockEvictionHandler)
extends Logging {
// Note: all changes to memory allocations, notably putting blocks, evicting blocks, and
// acquiring or releasing unroll memory, must be synchronized on `memoryManager`!
//保存 BlockId 的 value和size
private val entries = new LinkedHashMap[BlockId, MemoryEntry[_]](32, 0.75f, true)
// A mapping from taskAttemptId to amount of memory used for unrolling a block (in bytes)
// All accesses of this map are assumed to have manually synchronized on `memoryManager`
private val onHeapUnrollMemoryMap = mutable.HashMap[Long, Long]() //mapping taskId与内存量 堆上
// Note: off-heap unroll memory is only used in putIteratorAsBytes() because off-heap caching
// always stores serialized values.
private val offHeapUnrollMemoryMap = mutable.HashMap[Long, Long]() //mapping taskId与内存量 堆外
// Initial memory to request before unrolling any block
private val unrollMemoryThreshold: Long = //这一部分unroll内存 默认1M
conf.getLong("spark.storage.unrollMemoryThreshold", 1024 * 1024)
/** Total amount of memory available for storage, in bytes. */
private def maxMemory: Long = { // 最大的 Storage内存 = 最大堆上的Storage内存 + 最大堆外Storage内存
memoryManager.maxOnHeapStorageMemory + memoryManager.maxOffHeapStorageMemory
}
if (maxMemory < unrollMemoryThreshold) {
logWarning(s"Max memory ${Utils.bytesToString(maxMemory)} is less than the initial memory " +
s"threshold ${Utils.bytesToString(unrollMemoryThreshold)} needed to store a block in " +
s"memory. Please configure Spark with more memory.")
}
logInfo("MemoryStore started with capacity %s".format(Utils.bytesToString(maxMemory)))
/** Total storage memory used including unroll memory, in bytes. */
// 总的 storage 使用量
private def memoryUsed: Long = memoryManager.storageMemoryUsed //堆上+堆外
/**
* Amount of storage memory, in bytes, used for caching blocks.
* This does not include memory used for unrolling.
*/
// cache block 的内存使用量,不包括 unroll内存
private def blocksMemoryUsed: Long = memoryManager.synchronized {
memoryUsed - currentUnrollMemory
}
//这个 block的size
def getSize(blockId: BlockId): Long = {
entries.synchronized {
entries.get(blockId).size
}
}
/**
* Use `size` to test if there is enough space in MemoryStore. If so, create the ByteBuffer and
* put it into MemoryStore. Otherwise, the ByteBuffer won't be created.
*
* The caller should guarantee that `size` is correct.
*
* @return true if the put() succeeded, false otherwise.
*/
//缓存数据到 Storage 内存中 中
def putBytes[T: ClassTag](
blockId: BlockId,
size: Long,
memoryMode: MemoryMode, //堆上 or 堆外
_bytes: () => ChunkedByteBuffer): Boolean = { //_bytes 就是 要写的 数据
require(!contains(blockId), s"Block $blockId is already present in the MemoryStore") //肯定原来不存在
if (memoryManager.acquireStorageMemory(blockId, size, memoryMode)) { //开始计算内存,申请 Storage Memory,申请到的话,返回true 此时memoryManager已经更新了 内存的使用量
// We acquired enough memory for the block, so go ahead and put it
//
val bytes: ChunkedByteBuffer = _bytes()
assert(bytes.size == size) //检查大小 size
val entry = new SerializedMemoryEntry[T](bytes, memoryMode, implicitly[ClassTag[T]])
entries.synchronized { //put SerializedMemoryEntry对象到entries
entries.put(blockId, entry)
}
logInfo("Block %s stored as bytes in memory (estimated size %s, free %s)".format(
blockId, Utils.bytesToString(size), Utils.bytesToString(maxMemory - blocksMemoryUsed)))
true
} else {
false
}
}
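// Usage note (sketch): `_bytes` is a thunk, so the ChunkedByteBuffer is only materialized if
// acquireStorageMemory() succeeds, e.g.
//   memoryStore.putBytes(blockId, size, MemoryMode.ON_HEAP, () => buildBuffer())
// where buildBuffer() stands for whatever code produces the serialized bytes (hypothetical helper).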
/**
* Attempt to put the given block in memory store as values.
*
* It's possible that the iterator is too large to materialize and store in memory. To avoid
* OOM exceptions, this method will gradually unroll the iterator while periodically checking
* whether there is enough free memory. If the block is successfully materialized, then the
* temporary unroll memory used during the materialization is "transferred" to storage memory,
* so we won't acquire more memory than is actually needed to store the block.
*
* @return in case of success, the estimated size of the stored data. In case of failure, return
* an iterator containing the values of the block. The returned iterator will be backed
* by the combination of the partially-unrolled block and the remaining elements of the
* original input iterator. The caller must either fully consume this iterator or call
* `close()` on it in order to free the storage memory consumed by the partially-unrolled
* block.
*/
// 缓存数据到 Storage 内存中 values 是 Iterator类型
private[storage] def putIteratorAsValues[T](
blockId: BlockId,
values: Iterator[T],
classTag: ClassTag[T]): Either[PartiallyUnrolledIterator[T], Long] = { //返回的
require(!contains(blockId), s"Block $blockId is already present in the MemoryStore") //肯定原来不存在
// Number of elements unrolled so far
var elementsUnrolled = 0
// Whether there is still enough memory for us to continue unrolling this block
var keepUnrolling = true //循环申请 unroll 内存 标志
// Initial per-task memory to request for unrolling blocks (bytes).
val initialMemoryThreshold = unrollMemoryThreshold // 1M
// How often to check whether we need to request more memory
val memoryCheckPeriod: Long = conf.get(UNROLL_MEMORY_CHECK_PERIOD) //默认 16
// Memory currently reserved by this task for this particular unrolling operation
var memoryThreshold = initialMemoryThreshold // 1M
// Memory to request as a multiple of current vector size
val memoryGrowthFactor = conf.get(UNROLL_MEMORY_GROWTH_FACTOR) //默认 1.5
// Keep track of unroll memory used by this particular block / putIterator() operation
var unrollMemoryUsedByThisBlock = 0L
// Underlying vector for unrolling the block
var vector: SizeTrackingVector[T] = new SizeTrackingVector[T]()(classTag)
// Request enough memory to begin unrolling
keepUnrolling =
reserveUnrollMemoryForThisTask(blockId, initialMemoryThreshold, MemoryMode.ON_HEAP) //实际是 申请 1M Storage内存,并且 更新
if (!keepUnrolling) { //一般会申请到的
logWarning(s"Failed to reserve initial memory threshold of " +
s"${Utils.bytesToString(initialMemoryThreshold)} for computing block $blockId in memory.")
} else {
unrollMemoryUsedByThisBlock += initialMemoryThreshold //unrollMemoryUsedByThisBlock更新为1M
}
// Unroll this block safely, checking whether we have exceeded our threshold periodically
while (values.hasNext && keepUnrolling) { //values 依次 处理
vector += values.next() //这里的目的是累计到一定的量 再一次 申请内存,不要次次都申请
if (elementsUnrolled % memoryCheckPeriod == 0) { //elementsUnrolled 从0开始递增 memoryCheckPeriod 默认16
// If our vector's size has exceeded the threshold, request more memory
val currentSize = vector.estimateSize() //使用 vector 评估 内存量
if (currentSize >= memoryThreshold) { // memoryThreshold 1M
// memoryGrowthFactor增长系数 默认1.5 这里为什么要 减去memoryThreshold 因为上面已经申请了一次了
val amountToRequest = (currentSize * memoryGrowthFactor - memoryThreshold).toLong
keepUnrolling =
reserveUnrollMemoryForThisTask(blockId, amountToRequest, MemoryMode.ON_HEAP) //再次申请 一定量的内存
if (keepUnrolling) { //申请到的话
unrollMemoryUsedByThisBlock += amountToRequest // unrollMemoryUsedByThisBlock + amountToRequest
}
// New threshold is currentSize * memoryGrowthFactor
memoryThreshold += amountToRequest //更新 memoryThreshold 这个 内存阈值 增加阈值
}
}
elementsUnrolled += 1 // elementsUnrolled 自增
}
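// Worked example with made-up numbers: using the defaults above (initial threshold 1 MB,
// memoryCheckPeriod = 16, memoryGrowthFactor = 1.5), suppose that at one of the periodic checks
// vector.estimateSize() returns 3 MB. Then amountToRequest = 3 MB * 1.5 - 1 MB = 3.5 MB, and the
// new memoryThreshold becomes 1 MB + 3.5 MB = 4.5 MB (= currentSize * memoryGrowthFactor), so no
// further request is made until the vector grows past 4.5 MB. This keeps the number of
// acquireUnrollMemory calls small even for large blocks.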
if (keepUnrolling) { //这里这个是TRUE的话,说明整个values 已经都可以申请到内存
// We successfully unrolled the entirety of this block
val arrayValues = vector.toArray
vector = null
val entry =
new DeserializedMemoryEntry[T](arrayValues, SizeEstimator.estimate(arrayValues), classTag) //这个 entry 需要保存在entries中
val size = entry.size //内存量
def transferUnrollToStorage(amount: Long): Unit = { //申请 Storage 内存,释放 Unroll 内存
// Synchronize so that transfer is atomic
memoryManager.synchronized {
releaseUnrollMemoryForThisTask(MemoryMode.ON_HEAP, amount)
val success = memoryManager.acquireStorageMemory(blockId, amount, MemoryMode.ON_HEAP)
assert(success, "transferring unroll memory to storage memory failed")
}
}
// Acquire storage memory if necessary to store this block in memory.
val enoughStorageMemory = {
if (unrollMemoryUsedByThisBlock <= size) { //unrollMemoryUsedByThisBlock 可能小于 这个entry的总量,所以可能需要 额外申请内存
val acquiredExtra: Boolean =
memoryManager.acquireStorageMemory(
blockId, size - unrollMemoryUsedByThisBlock, MemoryMode.ON_HEAP) //额外申请的大小就是 size - unrollMemoryUsedByThisBlock 的量
if (acquiredExtra) { // the extra storage memory was granted; now convert the unroll memory this task already holds into storage memory
transferUnrollToStorage(unrollMemoryUsedByThisBlock) // release the unroll memory and acquire the same amount of storage memory
// together with the extra acquisition above, blockId is now backed by `size` bytes of storage memory
}
acquiredExtra
} else { // unrollMemoryUsedByThisBlock > size
// If this task attempt already owns more unroll memory than is necessary to store the
// block, then release the extra memory that will not be used.
val excessUnrollMemory = unrollMemoryUsedByThisBlock - size
releaseUnrollMemoryForThisTask(MemoryMode.ON_HEAP, excessUnrollMemory) // this task already holds more unroll memory than the block needs, so release the excess before the transfer
transferUnrollToStorage(size)
true
}
}
if (enoughStorageMemory) { //全部需要的 内存 都申请到 后
entries.synchronized {
entries.put(blockId, entry) ///把这个 entry put 到 entries
}
logInfo("Block %s stored as values in memory (estimated size %s, free %s)".format( //这个日志 在 spark的log里面 会 经常 看到的
blockId, Utils.bytesToString(size), Utils.bytesToString(maxMemory - blocksMemoryUsed)))
Right(size)
} else {// 内存申请 失败
assert(currentUnrollMemoryForThisTask >= unrollMemoryUsedByThisBlock,
"released too much unroll memory")
Left(new PartiallyUnrolledIterator(
this,
MemoryMode.ON_HEAP,
unrollMemoryUsedByThisBlock,
unrolled = arrayValues.toIterator,
rest = Iterator.empty))
}
} else {
// We ran out of space while unrolling the values for this block
//这里这个是false的话,说明整个values 申请不到到内存了
logUnrollFailureMessage(blockId, vector.estimateSize()) //log warn Not enough space to cache $blockId in memory!
Left(new PartiallyUnrolledIterator(
this,
MemoryMode.ON_HEAP,
unrollMemoryUsedByThisBlock,
unrolled = vector.iterator,
rest = values))
}
}
/**
* Attempt to put the given block in memory store as bytes.
*
* It's possible that the iterator is too large to materialize and store in memory. To avoid
* OOM exceptions, this method will gradually unroll the iterator while periodically checking
* whether there is enough free memory. If the block is successfully materialized, then the
* temporary unroll memory used during the materialization is "transferred" to storage memory,
* so we won't acquire more memory than is actually needed to store the block.
*
* @return in case of success, the estimated size of the stored data. In case of failure,
* return a handle which allows the caller to either finish the serialization by
* spilling to disk or to deserialize the partially-serialized block and reconstruct
* the original input iterator. The caller must either fully consume this result
* iterator or call `discard()` on it in order to free the storage memory consumed by the
* partially-unrolled block.
*/
// 和上面的 putIteratorAsValues 类似,但是 entry保存的是 序列化的数据
private[storage] def putIteratorAsBytes[T](
blockId: BlockId,
values: Iterator[T],
classTag: ClassTag[T],
memoryMode: MemoryMode): Either[PartiallySerializedBlock[T], Long] = {
require(!contains(blockId), s"Block $blockId is already present in the MemoryStore")
val allocator = memoryMode match {
case MemoryMode.ON_HEAP => ByteBuffer.allocate _
case MemoryMode.OFF_HEAP => Platform.allocateDirectBuffer _
}
// Whether there is still enough memory for us to continue unrolling this block
var keepUnrolling = true
// Number of elements unrolled so far
var elementsUnrolled = 0L
// How often to check whether we need to request more memory
val memoryCheckPeriod = conf.get(UNROLL_MEMORY_CHECK_PERIOD)
// Memory to request as a multiple of current bbos size
val memoryGrowthFactor = conf.get(UNROLL_MEMORY_GROWTH_FACTOR)
// Initial per-task memory to request for unrolling blocks (bytes).
val initialMemoryThreshold = unrollMemoryThreshold
// Keep track of unroll memory used by this particular block / putIterator() operation
var unrollMemoryUsedByThisBlock = 0L
// Underlying buffer for unrolling the block
val redirectableStream = new RedirectableOutputStream
val chunkSize = if (initialMemoryThreshold > Int.MaxValue) {
logWarning(s"Initial memory threshold of ${Utils.bytesToString(initialMemoryThreshold)} " +
s"is too large to be set as chunk size. Chunk size has been capped to " +
s"${Utils.bytesToString(Int.MaxValue)}")
Int.MaxValue
} else {
initialMemoryThreshold.toInt
}
val bbos = new ChunkedByteBufferOutputStream(chunkSize, allocator)
redirectableStream.setOutputStream(bbos)
val serializationStream: SerializationStream = {
val autoPick = !blockId.isInstanceOf[StreamBlockId]
val ser = serializerManager.getSerializer(classTag, autoPick).newInstance()
ser.serializeStream(serializerManager.wrapForCompression(blockId, redirectableStream))
}
// Request enough memory to begin unrolling
keepUnrolling = reserveUnrollMemoryForThisTask(blockId, initialMemoryThreshold, memoryMode)
if (!keepUnrolling) {
logWarning(s"Failed to reserve initial memory threshold of " +
s"${Utils.bytesToString(initialMemoryThreshold)} for computing block $blockId in memory.")
} else {
unrollMemoryUsedByThisBlock += initialMemoryThreshold
}
def reserveAdditionalMemoryIfNecessary(): Unit = {
if (bbos.size > unrollMemoryUsedByThisBlock) {
val amountToRequest = (bbos.size * memoryGrowthFactor - unrollMemoryUsedByThisBlock).toLong
keepUnrolling = reserveUnrollMemoryForThisTask(blockId, amountToRequest, memoryMode)
if (keepUnrolling) {
unrollMemoryUsedByThisBlock += amountToRequest
}
}
}
// Unroll this block safely, checking whether we have exceeded our threshold
while (values.hasNext && keepUnrolling) {
serializationStream.writeObject(values.next())(classTag)
elementsUnrolled += 1
if (elementsUnrolled % memoryCheckPeriod == 0) {
reserveAdditionalMemoryIfNecessary()
}
}
// Make sure that we have enough memory to store the block. By this point, it is possible that
// the block's actual memory usage has exceeded the unroll memory by a small amount, so we
// perform one final call to attempt to allocate additional memory if necessary.
if (keepUnrolling) {
serializationStream.close()
if (bbos.size > unrollMemoryUsedByThisBlock) {
val amountToRequest = bbos.size - unrollMemoryUsedByThisBlock
keepUnrolling = reserveUnrollMemoryForThisTask(blockId, amountToRequest, memoryMode)
if (keepUnrolling) {
unrollMemoryUsedByThisBlock += amountToRequest
}
}
}
if (keepUnrolling) {
val entry = SerializedMemoryEntry[T](bbos.toChunkedByteBuffer, memoryMode, classTag)
// Synchronize so that transfer is atomic
memoryManager.synchronized {
releaseUnrollMemoryForThisTask(memoryMode, unrollMemoryUsedByThisBlock)
val success = memoryManager.acquireStorageMemory(blockId, entry.size, memoryMode)
assert(success, "transferring unroll memory to storage memory failed")
}
entries.synchronized {
entries.put(blockId, entry)
}
logInfo("Block %s stored as bytes in memory (estimated size %s, free %s)".format(
blockId, Utils.bytesToString(entry.size),
Utils.bytesToString(maxMemory - blocksMemoryUsed)))
Right(entry.size)
} else {
// We ran out of space while unrolling the values for this block
logUnrollFailureMessage(blockId, bbos.size)
Left(
new PartiallySerializedBlock(
this,
serializerManager,
blockId,
serializationStream,
redirectableStream,
unrollMemoryUsedByThisBlock,
memoryMode,
bbos,
values,
classTag))
}
}
//获取 entrys 中的 序列化的 blockId 的value
def getBytes(blockId: BlockId): Option[ChunkedByteBuffer] = {
val entry = entries.synchronized { entries.get(blockId) }
entry match {
case null => None
case e: DeserializedMemoryEntry[_] =>
throw new IllegalArgumentException("should only call getBytes on serialized blocks")
case SerializedMemoryEntry(bytes, _, _) => Some(bytes)
}
}
//获取 entrys 中的 非序列化的 blockId 的value
def getValues(blockId: BlockId): Option[Iterator[_]] = {
val entry = entries.synchronized { entries.get(blockId) }
entry match {
case null => None
case e: SerializedMemoryEntry[_] =>
throw new IllegalArgumentException("should only call getValues on deserialized blocks")
case DeserializedMemoryEntry(values, _, _) =>
val x = Some(values)
x.map(_.iterator)
}
}
//remove blockId 在内存中的缓存
def remove(blockId: BlockId): Boolean = memoryManager.synchronized {
val entry = entries.synchronized { //从entries 移除 这个 blockId的值
entries.remove(blockId)
}
if (entry != null) {
entry match {
case SerializedMemoryEntry(buffer, _, _) => buffer.dispose() //如果是 序列化的数据 还需要 clean up any ByteBuffer
case _ =>
}
memoryManager.releaseStorageMemory(entry.size, entry.memoryMode) // 释放 对应的 Storage 内存大小
logDebug(s"Block $blockId of size ${entry.size} dropped " +
s"from memory (free ${maxMemory - blocksMemoryUsed})")
true
} else {
false
}
}
def clear(): Unit = memoryManager.synchronized {
entries.synchronized {
entries.clear()
}
onHeapUnrollMemoryMap.clear()
offHeapUnrollMemoryMap.clear()
memoryManager.releaseAllStorageMemory()
logInfo("MemoryStore cleared")
}
/**
* Return the RDD ID that a given block ID is from, or None if it is not an RDD block.
*/
private def getRddId(blockId: BlockId): Option[Int] = {
blockId.asRDDId.map(_.rddId)
}
/**
* Try to evict blocks to free up a given amount of space to store a particular block.
* Can fail if either the block is bigger than our memory or it would require replacing
* another block from the same RDD (which leads to a wasteful cyclic replacement pattern for
* RDDs that don't fit into memory that we want to avoid).
*
* @param blockId the ID of the block we are freeing space for, if any
* @param space the size of this block
* @param memoryMode the type of memory to free (on- or off-heap)
* @return the amount of memory (in bytes) freed by eviction
*/
// 当内存不够时, StorageMemoryPool 会在acquireMemory 方法中调用这个方法 来释放一些block data
private[spark] def evictBlocksToFreeSpace(
blockId: Option[BlockId],
space: Long, //需要多大的 内存量
memoryMode: MemoryMode): Long = { //堆上 or 堆外
assert(space > 0)
memoryManager.synchronized {
var freedMemory = 0L
val rddToAdd: Option[Int] = blockId.flatMap(getRddId) // rdd 的 id
val selectedBlocks = new ArrayBuffer[BlockId]
def blockIsEvictable(blockId: BlockId, entry: MemoryEntry[_]): Boolean = { //选择 非 本BlockId 和(堆上 or 堆外) 一致的 返回为 true
entry.memoryMode == memoryMode && (rddToAdd.isEmpty || rddToAdd != getRddId(blockId))
}
// This is synchronized to ensure that the set of entries is not changed
// (because of getValue or getBytes) while traversing the iterator, as that
// can lead to exceptions.
entries.synchronized {
val iterator = entries.entrySet().iterator()
while (freedMemory < space && iterator.hasNext) { // freedMemory 统计 可以释放的量
val pair = iterator.next()
val blockId = pair.getKey
val entry = pair.getValue
if (blockIsEvictable(blockId, entry)) { //依次 判断 是否可以 释放的 条件
// We don't want to evict blocks which are currently being read, so we need to obtain
// an exclusive write lock on blocks which are candidates for eviction. We perform a
// non-blocking "tryLock" here in order to ignore blocks which are locked for reading:
if (blockInfoManager.lockForWriting(blockId, blocking = false).isDefined) { // 这个 block 的 info信息 存在 且 没有正在写的 标记和 读的次数 为 0
selectedBlocks += blockId //selectedBlocks 保存 将要 释放 的 blockId
freedMemory += pair.getValue.size// freedMemory 统计 可以释放的量 增加这个 释放量
}
}
}
}
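// Illustration with made-up sizes: if we need space = 50 MB for rdd_2_0 and entries currently
// holds rdd_1_0 (30 MB), rdd_2_1 (40 MB) and broadcast_3 (25 MB), then rdd_2_1 is skipped by
// blockIsEvictable because it belongs to the same RDD as the block being stored (avoiding a
// cyclic evict-and-recompute pattern), while rdd_1_0 and broadcast_3 are selected once their
// write locks are acquired, giving freedMemory = 55 MB >= space.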
def dropBlock[T](blockId: BlockId, entry: MemoryEntry[T]): Unit = {
val data = entry match {
case DeserializedMemoryEntry(values, _, _) => Left(values)
case SerializedMemoryEntry(buffer, _, _) => Right(buffer)
}
val newEffectiveStorageLevel: StorageLevel =
blockEvictionHandler.dropFromMemory(blockId, () => data)(entry.classTag) //释放这个 blockId 的数据 ,根据当前block的 Storage 级别 判断 是 unlock 还是 removeBlock
if (newEffectiveStorageLevel.isValid) { // (useMemory || useDisk) && (replication > 0)
// The block is still present in at least one store, so release the lock
// but don't delete the block info
blockInfoManager.unlock(blockId)
} else {
// The block isn't present in any store, so delete the block info so that the
// block can be stored again
blockInfoManager.removeBlock(blockId)
}
}
if (freedMemory >= space) {//达到 space 的要求
var lastSuccessfulBlock = -1
try {
logInfo(s"${selectedBlocks.size} blocks selected for dropping " +
s"(${Utils.bytesToString(freedMemory)} bytes)") //日志提示 多少的 block 选中将要 释放掉
(0 until selectedBlocks.size).foreach { idx =>
val blockId = selectedBlocks(idx)
val entry = entries.synchronized {
entries.get(blockId)
}
// This should never be null as only one task should be dropping
// blocks and removing entries. However the check is still here for
// future safety.
if (entry != null) {
dropBlock(blockId, entry)
afterDropAction(blockId)
}
lastSuccessfulBlock = idx
}
logInfo(s"After dropping ${selectedBlocks.size} blocks, " +
s"free memory is ${Utils.bytesToString(maxMemory - blocksMemoryUsed)}")
freedMemory //返回释放了 多少的量
} finally {
// like BlockManager.doPut, we use a finally rather than a catch to avoid having to deal
// with InterruptedException
if (lastSuccessfulBlock != selectedBlocks.size - 1) {
// the blocks we didn't process successfully are still locked, so we have to unlock them
(lastSuccessfulBlock + 1 until selectedBlocks.size).foreach { idx =>
val blockId = selectedBlocks(idx)
blockInfoManager.unlock(blockId)
}
}
}
} else { //没有达到 space 的要求,即全部释放也不会 满足要求的量
blockId.foreach { id => //那么 则会提示 不会 缓存 这个 blockId
logInfo(s"Will not store $id")
}
selectedBlocks.foreach { id =>
blockInfoManager.unlock(id) //释放上面的 写锁
}
0L
}
}
}
// hook for testing, so we can simulate a race
protected def afterDropAction(blockId: BlockId): Unit = {}
//entries中是否存在这个 BlockId
def contains(blockId: BlockId): Boolean = {
entries.synchronized { entries.containsKey(blockId) }
}
//返回本线程的 taskID
private def currentTaskAttemptId(): Long = {
// In case this is called on the driver, return an invalid task attempt id.
Option(TaskContext.get()).map(_.taskAttemptId()).getOrElse(-1L)
}
/**
* Reserve memory for unrolling the given block for this task.
*
* @return whether the request is granted.
*/
// 申请 unroll内存
def reserveUnrollMemoryForThisTask(
blockId: BlockId,
memory: Long,
memoryMode: MemoryMode): Boolean = {
memoryManager.synchronized {
val success = memoryManager.acquireUnrollMemory(blockId, memory, memoryMode) //这里一样的是 申请的是 Storage的内存,如果是TRUE的话,表示申请成功,且已经
if (success) {
val taskAttemptId = currentTaskAttemptId()
val unrollMemoryMap = memoryMode match {
case MemoryMode.ON_HEAP => onHeapUnrollMemoryMap //一般是在 堆上的
case MemoryMode.OFF_HEAP => offHeapUnrollMemoryMap
}
unrollMemoryMap(taskAttemptId) = unrollMemoryMap.getOrElse(taskAttemptId, 0L) + memory //更新 unrollMemoryMap 的 taskID 的 内存使用量
}
success
}
}
/**
* Release memory used by this task for unrolling blocks.
* If the amount is not specified, remove the current task's allocation altogether.
*/
//释放 unroll 使用的的 内存
def releaseUnrollMemoryForThisTask(memoryMode: MemoryMode, memory: Long = Long.MaxValue): Unit = {
val taskAttemptId = currentTaskAttemptId()
memoryManager.synchronized {
val unrollMemoryMap = memoryMode match {
case MemoryMode.ON_HEAP => onHeapUnrollMemoryMap
case MemoryMode.OFF_HEAP => offHeapUnrollMemoryMap
}
if (unrollMemoryMap.contains(taskAttemptId)) {
val memoryToRelease = math.min(memory, unrollMemoryMap(taskAttemptId))
if (memoryToRelease > 0) {
unrollMemoryMap(taskAttemptId) -= memoryToRelease
memoryManager.releaseUnrollMemory(memoryToRelease, memoryMode)
}
if (unrollMemoryMap(taskAttemptId) == 0) {
unrollMemoryMap.remove(taskAttemptId)
}
}
}
}
/**
* Return the amount of memory currently occupied for unrolling blocks across all tasks.
*/
// 当前 unroll 内存的使用量
def currentUnrollMemory: Long = memoryManager.synchronized {
onHeapUnrollMemoryMap.values.sum + offHeapUnrollMemoryMap.values.sum
}
/**
* Return the amount of memory currently occupied for unrolling blocks by this task.
*/
def currentUnrollMemoryForThisTask: Long = memoryManager.synchronized {
onHeapUnrollMemoryMap.getOrElse(currentTaskAttemptId(), 0L) +
offHeapUnrollMemoryMap.getOrElse(currentTaskAttemptId(), 0L)
}
/**
* Return the number of tasks currently unrolling blocks.
*/
private def numTasksUnrolling: Int = memoryManager.synchronized {
(onHeapUnrollMemoryMap.keys ++ offHeapUnrollMemoryMap.keys).toSet.size
}
/**
* Log information about current memory usage.
*/
private def logMemoryUsage(): Unit = {
logInfo(
s"Memory use = ${Utils.bytesToString(blocksMemoryUsed)} (blocks) + " +
s"${Utils.bytesToString(currentUnrollMemory)} (scratch space shared across " +
s"$numTasksUnrolling tasks(s)) = ${Utils.bytesToString(memoryUsed)}. " +
s"Storage limit = ${Utils.bytesToString(maxMemory)}."
)
}
/**
* Log a warning for failing to unroll a block.
*
* @param blockId ID of the block we are trying to unroll.
* @param finalVectorSize Final size of the vector before unrolling failed.
*/
private def logUnrollFailureMessage(blockId: BlockId, finalVectorSize: Long): Unit = {
logWarning(
s"Not enough space to cache $blockId in memory! " +
s"(computed ${Utils.bytesToString(finalVectorSize)} so far)"
)
logMemoryUsage()
}
}
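Before moving on to DiskStore, here is a sketch of how a caller consumes the Either returned by putIteratorAsValues. It is similar in spirit to what BlockManager.doPutIterator does; memoryStore, blockId and values (a Seq of records) are assumed to already exist:
import scala.reflect.ClassTag

memoryStore.putIteratorAsValues(blockId, values.iterator, ClassTag.Any) match {
  case Right(size) =>
    // fully unrolled: `size` bytes of storage memory are now attributed to blockId
    println(s"cached $blockId in memory ($size bytes)")
  case Left(partiallyUnrolled) =>
    // out of storage memory: the caller must consume or close() this iterator,
    // otherwise the unroll memory still held by the partially unrolled block is leaked
    partiallyUnrolled.close()
}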
Next, a walkthrough of the DiskStore source:
// Persists block data to disk
private[spark] class DiskStore(
conf: SparkConf,
diskManager: DiskBlockManager,
securityManager: SecurityManager) extends Logging {
private val minMemoryMapBytes = conf.getSizeAsBytes("spark.storage.memoryMapThreshold", "2m")
private val maxMemoryMapBytes = conf.getSizeAsBytes("spark.storage.memoryMapLimitForTests",
Int.MaxValue.toString)
private val blockSizes = new ConcurrentHashMap[BlockId, Long]() // blockdata 的 BlockId 和 大小量的 key-map 键值对
//获取 这个 blockId 的 容量size
def getSize(blockId: BlockId): Long = blockSizes.get(blockId)
/**
* Invokes the provided callback function to write the specific block.
*
* @throws IllegalStateException if the block already exists in the disk store.
*/
//写 数据 到磁盘
def put(blockId: BlockId)(writeFunc: WritableByteChannel => Unit): Unit = {
if (contains(blockId)) {
throw new IllegalStateException(s"Block $blockId is already present in the disk store")
}
logDebug(s"Attempting to put block $blockId")
val startTime = System.currentTimeMillis
val file = diskManager.getFile(blockId) //获取 保存的文件 绝对路径,没有的话 会自动创建 文件名称 以blockId的name结束
val out = new CountingWritableChannel(openForWrite(file))
var threwException: Boolean = true
try {
writeFunc(out)
blockSizes.put(blockId, out.getCount) //更新这个blockId 的data 大小
threwException = false
} finally {
try {
out.close()
} catch {
case ioe: IOException =>
if (!threwException) {
threwException = true
throw ioe
}
} finally {
if (threwException) {
remove(blockId)
}
}
}
val finishTime = System.currentTimeMillis
logDebug("Block %s stored as %s file on disk in %d ms".format(
file.getName,
Utils.bytesToString(file.length()),
finishTime - startTime))
}
//写入 bytes数据 到 磁盘
def putBytes(blockId: BlockId, bytes: ChunkedByteBuffer): Unit = {
put(blockId) { channel =>
bytes.writeFully(channel)
}
}
//读取 blockId data数据 返回包装的 BlockData
def getBytes(blockId: BlockId): BlockData = {
val file = diskManager.getFile(blockId.name) //获取到保存的文件对象
val blockSize = getSize(blockId) //获取的 保存的数据 的大小size
securityManager.getIOEncryptionKey() match {
case Some(key) =>
// Encrypted blocks cannot be memory mapped; return a special object that does decryption
// and provides InputStream / FileRegion implementations for reading the data.
new EncryptedBlockData(file, blockSize, conf, key)
case _ => //一般不会 配置 加密 所以走这个
new DiskBlockData(minMemoryMapBytes, maxMemoryMapBytes, file, blockSize) //返回 包含数据文件对象的 DiskBlockData 对象
}
}
//移除一个 blockId data
def remove(blockId: BlockId): Boolean = {
blockSizes.remove(blockId)
val file = diskManager.getFile(blockId.name)
if (file.exists()) {
val ret = file.delete()
if (!ret) {
logWarning(s"Error deleting ${file.getPath()}")
}
ret
} else {
false
}
}
//检查 磁盘是否存在这个 blockId 的data file
def contains(blockId: BlockId): Boolean = {
val file = diskManager.getFile(blockId.name)
file.exists()
}
//写文件数据 统一处理方法,可能要加密写数据
private def openForWrite(file: File): WritableByteChannel = {
val out = new FileOutputStream(file).getChannel()
try {
securityManager.getIOEncryptionKey().map { key =>
CryptoStreamUtils.createWritableChannel(out, conf, key)
}.getOrElse(out)
} catch {
case e: Exception =>
Closeables.close(out, true)
file.delete()
throw e
}
}
}
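A quick sketch of the DiskStore API above (diskStore, a not-yet-stored blockId and a ChunkedByteBuffer named bytes are assumed to exist):
diskStore.put(blockId) { channel =>
  bytes.writeFully(channel)                  // same callback pattern putBytes() uses
}
println(s"$blockId occupies ${diskStore.getSize(blockId)} bytes on disk")
val data = diskStore.getBytes(blockId)       // DiskBlockData, or EncryptedBlockData if I/O encryption is on
diskStore.remove(blockId)                    // deletes the file and drops the recorded size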
class BlockManager
This class manages the block data on the local node and can also fetch block data from other executors.
Because it has to upload and download block data to and from other executors, it holds a BlockTransferService (concretely a NettyBlockTransferService) for fetching and uploading blocks.
Because it has to register RPC endpoints, it holds an RpcEnv (a NettyRpcEnv) and registers a BlockManagerSlaveEndpoint on it, which handles the driver's requests to operate on this node's blocks.
Because it has to communicate with the driver, it holds a BlockManagerMaster instance.
Because it has to keep the node from running out of memory, it holds a MemoryManager instance.
Because it has to serialize and deserialize objects, it holds a SerializerManager instance.
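None of this machinery is called directly from user code; it is driven by caching. An end-to-end illustration using the standard RDD API (the path and RDD are placeholders):
import org.apache.spark.storage.StorageLevel

val cached = sc.textFile("hdfs:///tmp/input").persist(StorageLevel.MEMORY_AND_DISK)
cached.count() // 1st action: each executor's BlockManager stores rdd_<id>_<partition> blocks via MemoryStore, spilling to DiskStore when memory runs out
cached.count() // 2nd action: the blocks are served from the local stores (or fetched from peers) instead of being recomputed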
Let's look at the source of this class:
// Manages the block data on the local node and can also fetch block data from other executors; the internal slaveEndpoint responds to driver requests that operate on this node's blocks
private[spark] class BlockManager(
executorId: String, // "driver" on the driver, otherwise the executor's id
rpcEnv: RpcEnv,
val master: BlockManagerMaster, // blockManagerMaster 包装了一下 和 driver 的通信
val serializerManager: SerializerManager, // 序列化 管理器
val conf: SparkConf,
memoryManager: MemoryManager, //动态内存管理
mapOutputTracker: MapOutputTracker,
shuffleManager: ShuffleManager,
val blockTransferService: BlockTransferService, //是 NettyBlockTransferService 这个类的主要作用是 使用 Netty 获取和上传 blocks
securityManager: SecurityManager,
numUsableCores: Int)
extends BlockDataManager with BlockEvictionHandler with Logging { //BlockDataManager 主要 getBlockData,putBlockData,releaseLock
private[spark] val externalShuffleServiceEnabled =
conf.getBoolean("spark.shuffle.service.enabled", false) //外部的 shuffle server 默认 不开启
val diskBlockManager = { //用来管理 临时存放在 磁盘上的 文件,JVM 退出的时候 diskBlockManager会通过 钩子函数清理掉
// Only perform cleanup if an external service is not serving our shuffle files.
val deleteFilesOnStop =
!externalShuffleServiceEnabled || executorId == SparkContext.DRIVER_IDENTIFIER //这个选项一般都是true
new DiskBlockManager(conf, deleteFilesOnStop)
}
// Visible for testing
private[storage] val blockInfoManager = new BlockInfoManager //block 读写锁 的 管理,里面有缓存的block 的读写锁等信息
private val futureExecutionContext = ExecutionContext.fromExecutorService(
ThreadUtils.newDaemonCachedThreadPool("block-manager-future", 128))
// Actual storage of where blocks are kept
private[spark] val memoryStore = //把block data 存在内存管理起来
new MemoryStore(conf, blockInfoManager, serializerManager, memoryManager, this)
private[spark] val diskStore = new DiskStore(conf, diskBlockManager, securityManager)
memoryManager.setMemoryStore(memoryStore)
// Note: depending on the memory manager, `maxMemory` may actually vary over time.
// However, since we use this only for reporting and logging, what we actually want here is
// the absolute maximum value that `maxMemory` can ever possibly reach. We may need
// to revisit whether reporting this value as the "max" is intuitive to the user.
private val maxOnHeapMemory = memoryManager.maxOnHeapStorageMemory // current on-heap storage limit: maxHeapMemory - onHeapExecutionMemoryPool.memoryUsed
private val maxOffHeapMemory = memoryManager.maxOffHeapStorageMemory // current off-heap storage limit: maxOffHeapMemory - offHeapExecutionMemoryPool.memoryUsed
// Port used by the external shuffle service. In Yarn mode, this may be already be
// set through the Hadoop configuration as the server is launched in the Yarn NM.
private val externalShuffleServicePort: Int = { // spark.shuffle.service.port 端口 默认 7337
val tmpPort = Utils.getSparkOrYarnConfig(conf, "spark.shuffle.service.port", "7337").toInt
if (tmpPort == 0) {
// for testing, we set "spark.shuffle.service.port" to 0 in the yarn config, so yarn finds
// an open port. But we still need to tell our spark apps the right port to use. So
// only if the yarn config has the port set to 0, we prefer the value in the spark config
conf.get("spark.shuffle.service.port").toInt
} else {
tmpPort
}
}
var blockManagerId: BlockManagerId = _ // one BlockManagerId per block manager (one per executor, plus one for the driver); it carries the host and port that identify this node's block service
// Address of the server that serves this executor's shuffle files. This is either an external
// service, or just our own Executor's BlockManager.
private[spark] var shuffleServerId: BlockManagerId = _
// Client to read other executors' shuffle files. This is either an external service, or just the
// standard BlockTransferService to directly connect to other Executors.
private[spark] val shuffleClient = if (externalShuffleServiceEnabled) { //默认使用NettyBlockTransferService
val transConf = SparkTransportConf.fromSparkConf(conf, "shuffle", numUsableCores)
new ExternalShuffleClient(transConf, securityManager,
securityManager.isAuthenticationEnabled(), conf.get(config.SHUFFLE_REGISTRATION_TIMEOUT))
} else {
blockTransferService
}
// Max number of failures before this block manager refreshes the block locations from the driver
private val maxFailuresBeforeLocationRefresh =
conf.getInt("spark.block.failures.beforeLocationRefresh", 5)
private val slaveEndpoint: RpcEndpointRef = rpcEnv.setupEndpoint( // BlockManagerSlaveEndpoint 这个存在的目的在于 响应drive请求 操作本节点的 block
"BlockManagerEndpoint" + BlockManager.ID_GENERATOR.next,
new BlockManagerSlaveEndpoint(rpcEnv, this, mapOutputTracker))
// Pending re-registration action being executed asynchronously or null if none is pending.
// Accesses should synchronize on asyncReregisterLock.
private var asyncReregisterTask: Future[Unit] = null
private val asyncReregisterLock = new Object
// Field related to peer block managers that are necessary for block replication
@volatile private var cachedPeers: Seq[BlockManagerId] = _
private val peerFetchLock = new Object
private var lastPeerFetchTime = 0L
private var blockReplicationPolicy: BlockReplicationPolicy = _
// A DownloadFileManager used to track all the files of remote blocks which are above the
// specified memory threshold. Files will be deleted automatically based on weak reference.
// Exposed for test
private[storage] val remoteBlockTempFileManager: BlockManager.RemoteBlockDownloadFileManager =
new BlockManager.RemoteBlockDownloadFileManager(this)
private val maxRemoteBlockToMem = conf.get(config.MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM)
/**
* Initializes the BlockManager with the given appId. This is not performed in the constructor as
* the appId may not be known at BlockManager instantiation time (in particular for the driver,
* where it is only learned after registration with the TaskScheduler).
*
* This method initializes the BlockTransferService and ShuffleClient, registers with the
* BlockManagerMaster, starts the BlockManagerWorker endpoint, and registers with a local shuffle
* service if configured.
*/
//在SparkContext里面 才会调用 这个 initialize,初始化 NettyBlockTransferService ,向driver注册这个 BlockMananger
def initialize(appId: String): Unit = {
blockTransferService.init(this) //是 NettyBlockTransferService 这个类的主要作用是 使用 Netty 获取和上传 blocks
shuffleClient.init(appId)
blockReplicationPolicy = {
val priorityClass = conf.get(
"spark.storage.replication.policy", classOf[RandomBlockReplicationPolicy].getName)
val clazz = Utils.classForName(priorityClass)
val ret = clazz.newInstance.asInstanceOf[BlockReplicationPolicy]
logInfo(s"Using $priorityClass for block replication policy")
ret
}
val id =
BlockManagerId(executorId, blockTransferService.hostName, blockTransferService.port, None) //这个节点的 host port 等信息
val idFromMaster = master.registerBlockManager( // 向 driver 注册这个 BlockManager,在 BlockManagerMasterEndpoint 这个endPoint中会响应的
id,
maxOnHeapMemory,
maxOffHeapMemory,
slaveEndpoint)
blockManagerId = if (idFromMaster != null) idFromMaster else id // 包含 机器的 主机 host port 等信息
shuffleServerId = if (externalShuffleServiceEnabled) {
logInfo(s"external shuffle service port = $externalShuffleServicePort")
BlockManagerId(executorId, blockTransferService.hostName, externalShuffleServicePort)
} else {
blockManagerId
}
// Register Executors' configuration with the local shuffle service, if one should exist.
if (externalShuffleServiceEnabled && !blockManagerId.isDriver) { //一般 不会
registerWithExternalShuffleServer()
}
logInfo(s"Initialized BlockManager: $blockManagerId")
}
def shuffleMetricsSource: Source = {
import BlockManager._
if (externalShuffleServiceEnabled) {
new ShuffleMetricsSource("ExternalShuffle", shuffleClient.shuffleMetrics())
} else {
new ShuffleMetricsSource("NettyBlockTransfer", shuffleClient.shuffleMetrics())
}
}
private def registerWithExternalShuffleServer() {
logInfo("Registering executor with local external shuffle service.")
val shuffleConfig = new ExecutorShuffleInfo(
diskBlockManager.localDirs.map(_.toString),
diskBlockManager.subDirsPerLocalDir,
shuffleManager.getClass.getName)
val MAX_ATTEMPTS = conf.get(config.SHUFFLE_REGISTRATION_MAX_ATTEMPTS)
val SLEEP_TIME_SECS = 5
for (i <- 1 to MAX_ATTEMPTS) {
try {
// Synchronous and will throw an exception if we cannot connect.
shuffleClient.asInstanceOf[ExternalShuffleClient].registerWithShuffleServer(
shuffleServerId.host, shuffleServerId.port, shuffleServerId.executorId, shuffleConfig)
return
} catch {
case e: Exception if i < MAX_ATTEMPTS =>
logError(s"Failed to connect to external shuffle server, will retry ${MAX_ATTEMPTS - i}"
+ s" more times after waiting $SLEEP_TIME_SECS seconds...", e)
Thread.sleep(SLEEP_TIME_SECS * 1000L)
case NonFatal(e) =>
throw new SparkException("Unable to register with external shuffle server due to : " +
e.getMessage, e)
}
}
}
/**
* Report all blocks to the BlockManager again. This may be necessary if we are dropped
* by the BlockManager and come back or if we become capable of recovering blocks on disk after
* an executor crash.
*
* This function deliberately fails silently if the master returns false (indicating that
* the slave needs to re-register). The error condition will be detected again by the next
* heart beat attempt or new block registration and another try to re-register all blocks
* will be made then.
*/
//向 driver 报告本 节点 的 blockInfoManager 中的所有的 block
private def reportAllBlocks(): Unit = {
logInfo(s"Reporting ${blockInfoManager.size} blocks to the master.")
for ((blockId, info) <- blockInfoManager.entries) {
val status = getCurrentBlockStatus(blockId, info)
if (info.tellMaster && !tryToReportBlockStatus(blockId, status)) {
logError(s"Failed to report $blockId to master; giving up.")
return
}
}
}
/**
* Re-register with the master and report all blocks to it. This will be called by the heart beat
* thread if our heartbeat to the block manager indicates that we were not registered.
*
* Note that this method must be called without any BlockInfo locks held.
*/
// 向 driver 注册 BlockManager 且 报告 本 节点 的 blockInfoManager 中的所有的 block
def reregister(): Unit = {
// TODO: We might need to rate limit re-registering.
logInfo(s"BlockManager $blockManagerId re-registering with master")
master.registerBlockManager(blockManagerId, maxOnHeapMemory, maxOffHeapMemory, slaveEndpoint) //blockManagerId 里面有 host port
reportAllBlocks() //向 driver 报告本 节点 的 blockInfoManager 中的所有的 block
}
/**
* Re-register with the master sometime soon.
*/
// 异步 向 driver 注册 BlockManager 且 报告 本 节点 的 blockInfoManager 中的所有的 block
private def asyncReregister(): Unit = {
asyncReregisterLock.synchronized {
if (asyncReregisterTask == null) {
asyncReregisterTask = Future[Unit] {
// This is a blocking action and should run in futureExecutionContext which is a cached
// thread pool
reregister() // 向 driver 注册 BlockManager 且 报告 本 节点 的 blockInfoManager 中的所有的 block
asyncReregisterLock.synchronized {
asyncReregisterTask = null
}
}(futureExecutionContext)
}
}
}
/**
* For testing. Wait for any pending asynchronous re-registration; otherwise, do nothing.
*/
// 等待 异步 注册BlockManager 且 报告 本 节点 的 blockInfoManager 中的所有的 block 的 任务执行完成
def waitForAsyncReregister(): Unit = {
val task = asyncReregisterTask
if (task != null) {
try {
ThreadUtils.awaitReady(task, Duration.Inf)
} catch {
case NonFatal(t) =>
throw new Exception("Error occurred while waiting for async. reregistration", t)
}
}
}
/**
* Interface to get local block data. Throws an exception if the block cannot be found or
* cannot be read successfully.
*/
//通过 blockId 获取 BlockData
override def getBlockData(blockId: BlockId): ManagedBuffer = {
if (blockId.isShuffle) {//如果是 shuffleBlock,那么将会从 shuffleManager中 获取数据,因为shuffle 是 跨节点的
shuffleManager.shuffleBlockResolver.getBlockData(blockId.asInstanceOf[ShuffleBlockId])
} else {
getLocalBytes(blockId) match {
case Some(blockData) =>
new BlockManagerManagedBuffer(blockInfoManager, blockId, blockData, true)
case None =>
// If this block manager receives a request for a block that it doesn't have then it's
// likely that the master has outdated block statuses for this block. Therefore, we send
// an RPC so that this block is marked as being unavailable from this block manager.
reportBlockStatus(blockId, BlockStatus.empty)
throw new BlockNotFoundException(blockId.toString)
}
}
}
/**
* Put the block locally, using the given storage level.
*
* '''Important!''' Callers must not mutate or release the data buffer underlying `bytes`. Doing
* so may corrupt or change the data stored by the `BlockManager`.
*/
// 写 block data 数据
override def putBlockData(
blockId: BlockId,
data: ManagedBuffer,
level: StorageLevel,
classTag: ClassTag[_]): Boolean = {
putBytes(blockId, new ChunkedByteBuffer(data.nioByteBuffer()), level)(classTag)
}
/**
* Get the BlockStatus for the block identified by the given ID, if it exists.
* NOTE: This is mainly for testing.
*/
//获取这个 blockId 的 BlockStatus
def getStatus(blockId: BlockId): Option[BlockStatus] = {
blockInfoManager.get(blockId).map { info =>
val memSize = if (memoryStore.contains(blockId)) memoryStore.getSize(blockId) else 0L
val diskSize = if (diskStore.contains(blockId)) diskStore.getSize(blockId) else 0L
BlockStatus(info.level, memSize = memSize, diskSize = diskSize)
}
}
/**
* Get the ids of existing blocks that match the given filter. Note that this will
* query the blocks stored in the disk block manager (that the block manager
* may not know of).
*/
//filter 一定的 BlockIds
def getMatchingBlockIds(filter: BlockId => Boolean): Seq[BlockId] = {
// The `toArray` is necessary here in order to force the list to be materialized so that we
// don't try to serialize a lazy iterator when responding to client requests.
(blockInfoManager.entries.map(_._1) ++ diskBlockManager.getAllBlocks())
.filter(filter)
.toArray
.toSeq
}
/**
* Tell the master about the current storage status of a block. This will send a block update
* message reflecting the current status, *not* the desired storage level in its block info.
* For example, a block with MEMORY_AND_DISK set might have fallen out to be only on disk.
*
* droppedMemorySize exists to account for when the block is dropped from memory to disk (so
* it is still valid). This ensures that update in master will compensate for the increase in
* memory on slave.
*/
//报告 Block 状态 如果driver 以前没有注册这个 blockManagerId,也会 异步 向 driver 注册 BlockManager 且 报告 本 节点 的 blockInfoManager 中的所有的 block
private def reportBlockStatus(
blockId: BlockId,
status: BlockStatus,
droppedMemorySize: Long = 0L): Unit = {
// tryToReportBlockStatus returns true when the master records the update; false means the master does not know this blockManagerId and we must re-register
val needReregister = !tryToReportBlockStatus(blockId, status, droppedMemorySize) // send the updated BlockStatus (storageLevel, inMemSize, onDiskSize) to the driver
if (needReregister) { // the master has no record of this blockManagerId, so re-register (which also re-reports all blocks)
logInfo(s"Got told to re-register updating block $blockId")
// Re-registering will report our new block for free.
asyncReregister() // 异步 向 driver 注册 BlockManager 且 报告 本 节点 的 blockInfoManager 中的所有的 block
}
logDebug(s"Told master about block $blockId")
}
/**
* Actually send a UpdateBlockInfo message. Returns the master's response,
* which will be true if the block was successfully recorded and false if
* the slave needs to re-register.
*/
//向driver 发送 更新 BlockStatus 信息 包括 storageLevel,inMemSize,onDiskSize
private def tryToReportBlockStatus(
blockId: BlockId,
status: BlockStatus,
droppedMemorySize: Long = 0L): Boolean = {
val storageLevel = status.storageLevel
val inMemSize = Math.max(status.memSize, droppedMemorySize)
val onDiskSize = status.diskSize
//向driver 发送 更新 BlockInfo 信息,blockManagerId 里面有位置 host port 等信息
master.updateBlockInfo(blockManagerId, blockId, storageLevel, inMemSize, onDiskSize)
}
/**
* Return the updated storage status of the block with the given ID. More specifically, if
* the block is dropped from memory and possibly added to disk, return the new storage level
* and the updated in-memory and on-disk sizes.
*/
//获取当前 blockId 的状态
private def getCurrentBlockStatus(blockId: BlockId, info: BlockInfo): BlockStatus = {
info.synchronized {
info.level match {
case null =>
BlockStatus.empty
case level =>
val inMem = level.useMemory && memoryStore.contains(blockId)
val onDisk = level.useDisk && diskStore.contains(blockId)
val deserialized = if (inMem) level.deserialized else false
val replication = if (inMem || onDisk) level.replication else 1
val storageLevel = StorageLevel(
useDisk = onDisk,
useMemory = inMem,
useOffHeap = level.useOffHeap,
deserialized = deserialized,
replication = replication)
val memSize = if (inMem) memoryStore.getSize(blockId) else 0L
val diskSize = if (onDisk) diskStore.getSize(blockId) else 0L
BlockStatus(storageLevel, memSize, diskSize)
}
}
}
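getCurrentBlockStatus recomputes the effective storage level from where the block actually is right now (memory and/or disk), which may differ from the level requested at put time. A simplified sketch with stand-in case classes (SimpleLevel and SimpleStatus are illustrative, not Spark's StorageLevel/BlockStatus):
object CurrentStatusSketch {
  case class SimpleLevel(useMemory: Boolean, useDisk: Boolean, deserialized: Boolean, replication: Int)
  case class SimpleStatus(level: SimpleLevel, memSize: Long, diskSize: Long)
  def currentStatus(requested: SimpleLevel,
                    inMemory: Boolean, memSize: Long,
                    onDisk: Boolean, diskSize: Long): SimpleStatus = {
    val inMem = requested.useMemory && inMemory   // only counts if the level allows memory
    val onDsk = requested.useDisk && onDisk       // only counts if the level allows disk
    val effective = SimpleLevel(
      useMemory = inMem,
      useDisk = onDsk,
      deserialized = if (inMem) requested.deserialized else false,
      replication = if (inMem || onDsk) requested.replication else 1)
    SimpleStatus(effective, if (inMem) memSize else 0L, if (onDsk) diskSize else 0L)
  }
}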
/**
* Get locations of an array of blocks.
*/
// ask the BlockManagerMasterEndpoint which BlockManagerIds hold each of the given blockIds
private def getLocationBlockIds(blockIds: Array[BlockId]): Array[Seq[BlockManagerId]] = {
val startTimeMs = System.currentTimeMillis
val locations: Array[Seq[BlockManagerId]] = master.getLocations(blockIds).toArray //向BlockManagerMasterEndPoint 获取 blockIds 在那些 BlockManagerId 的序列
logDebug("Got multiple block location in %s".format(Utils.getUsedTimeMs(startTimeMs)))
locations
}
/**
* Cleanup code run in response to a failed local read.
* Must be called while holding a read lock on the block.
*/
// cleanup after a local read fails: release the read lock, remove the block, and let the driver know it is gone
private def handleLocalReadFailure(blockId: BlockId): Nothing = {
releaseLock(blockId)
// Remove the missing block so that its unavailability is reported to the driver
removeBlock(blockId)
throw new SparkException(s"Block $blockId was not found even though it's read-locked")
}
/**
* Get block from local block manager as an iterator of Java objects.
*/
//从本地 blockManager 中获取 blockId data
def getLocalValues(blockId: BlockId): Option[BlockResult] = {
logDebug(s"Getting local block $blockId")
blockInfoManager.lockForReading(blockId) match {//锁读得到 blockInfo信息
case None => //不存在这个 blockId
logDebug(s"Block $blockId was not found")
None
case Some(info) =>
val level: StorageLevel = info.level
logDebug(s"Level for block $blockId is $level")
val taskAttemptId = Option(TaskContext.get()).map(_.taskAttemptId())
if (level.useMemory && memoryStore.contains(blockId)) {//从内存里面获取
val iter: Iterator[Any] = if (level.deserialized) {//非序列化的数据
memoryStore.getValues(blockId).get
} else {//序列化的数据
serializerManager.dataDeserializeStream(
blockId, memoryStore.getBytes(blockId).get.toInputStream())(info.classTag)
}
// We need to capture the current taskId in case the iterator completion is triggered
// from a different thread which does not have TaskContext set; see SPARK-18406 for
// discussion.
val ci: CompletionIterator[Any, Iterator[Any]] = CompletionIterator[Any, Iterator[Any]](iter, {
releaseLock(blockId, taskAttemptId)
}) //释放读锁
Some(new BlockResult(ci, DataReadMethod.Memory, info.size))
} else if (level.useDisk && diskStore.contains(blockId)) { // read from disk
val diskData = diskStore.getBytes(blockId)
val iterToReturn: Iterator[Any] = {
if (level.deserialized) {
val diskValues = serializerManager.dataDeserializeStream(
blockId,
diskData.toInputStream())(info.classTag)
maybeCacheDiskValuesInMemory(info, blockId, level, diskValues)
} else {
val stream = maybeCacheDiskBytesInMemory(info, blockId, level, diskData)
.map { _.toInputStream(dispose = false) }
.getOrElse { diskData.toInputStream() }
serializerManager.dataDeserializeStream(blockId, stream)(info.classTag)
}
}
val ci = CompletionIterator[Any, Iterator[Any]](iterToReturn, {
releaseLockAndDispose(blockId, diskData, taskAttemptId)
})//释放读锁
Some(new BlockResult(ci, DataReadMethod.Disk, info.size))
} else {
handleLocalReadFailure(blockId)
}
}
}
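The CompletionIterator wrapper is what lets getLocalValues return data while still holding the read lock: the lock is only released once the caller has fully consumed the iterator. A tiny generic version of the same trick (OnCompleteIterator is illustrative, not the Spark class):
class OnCompleteIterator[A](underlying: Iterator[A], onComplete: () => Unit) extends Iterator[A] {
  private var completed = false
  override def hasNext: Boolean = {
    val more = underlying.hasNext
    if (!more && !completed) { completed = true; onComplete() } // run the cleanup exactly once, at exhaustion
    more
  }
  override def next(): A = underlying.next()
}
// Usage sketch: wrap the values so the read lock is released only after they are drained, e.g.
// new OnCompleteIterator(values, () => releaseLock(blockId, taskAttemptId))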
/**
* Get block from the local block manager as serialized bytes.
*/
//从本地 blockManager 中获取 blockId data 的 serialized bytes.
def getLocalBytes(blockId: BlockId): Option[BlockData] = {
logDebug(s"Getting local block $blockId as bytes")
// As an optimization for map output fetches, if the block is for a shuffle, return it
// without acquiring a lock; the disk store never deletes (recent) items so this should work
if (blockId.isShuffle) { // shuffle blocks are resolved through the shuffleManager's shuffleBlockResolver rather than the local stores
val shuffleBlockResolver = shuffleManager.shuffleBlockResolver
// TODO: This should gracefully handle case where local block is not available. Currently
// downstream code will throw an exception.
val buf = new ChunkedByteBuffer(
shuffleBlockResolver.getBlockData(blockId.asInstanceOf[ShuffleBlockId]).nioByteBuffer())
Some(new ByteBufferBlockData(buf, true))
} else {//先锁读,拿到 blockInfo,此时没有释放 这个读锁
blockInfoManager.lockForReading(blockId).map { info => doGetLocalBytes(blockId, info) }
}
}
/**
* Get block from the local block manager as serialized bytes.
*
* Must be called while holding a read lock on the block.
* Releases the read lock upon exception; keeps the read lock upon successful return.
*/
//从 本地的 blockManager 中 获取 block data数据
private def doGetLocalBytes(blockId: BlockId, info: BlockInfo): BlockData = {
val level = info.level
logDebug(s"Level for block $blockId is $level")
// In order, try to read the serialized bytes from memory, then from disk, then fall back to
// serializing in-memory objects, and, finally, throw an exception if the block does not exist.
if (level.deserialized) {//非序列化的数据
// Try to avoid expensive serialization by reading a pre-serialized copy from disk:
if (level.useDisk && diskStore.contains(blockId)) {//从磁盘中读取
// Note: we purposely do not try to put the block back into memory here. Since this branch
// handles deserialized blocks, this block may only be cached in memory as objects, not
// serialized bytes. Because the caller only requested bytes, it doesn't make sense to
// cache the block's deserialized objects since that caching may not have a payoff.
diskStore.getBytes(blockId)
} else if (level.useMemory && memoryStore.contains(blockId)) { // 从内存中获取
// The block was not found on disk, so serialize an in-memory copy:
new ByteBufferBlockData(serializerManager.dataSerializeWithExplicitClassTag(
blockId, memoryStore.getValues(blockId).get, info.classTag), true)
} else { //否则 清理这个不存在的 block 缓存,并抛出异常
handleLocalReadFailure(blockId)
}
} else { // storage level is serialized 序列化的数据
if (level.useMemory && memoryStore.contains(blockId)) { // 从内存中获取
new ByteBufferBlockData(memoryStore.getBytes(blockId).get, false)
} else if (level.useDisk && diskStore.contains(blockId)) {//从磁盘中读取
val diskData = diskStore.getBytes(blockId)
maybeCacheDiskBytesInMemory(info, blockId, level, diskData)
.map(new ByteBufferBlockData(_, false))
.getOrElse(diskData)
} else {//否则 清理这个不存在的 block 缓存,并抛出异常
handleLocalReadFailure(blockId)
}
}
}
/**
* Get block from remote block managers.
*
* This does not acquire a lock on this block in this JVM.
*/
private def getRemoteValues[T: ClassTag](blockId: BlockId): Option[BlockResult] = {
val ct: ClassTag[T] = implicitly[ClassTag[T]]
getRemoteBytes(blockId).map { data =>
val values =
serializerManager.dataDeserializeStream(blockId, data.toInputStream(dispose = true))(ct)
new BlockResult(values, DataReadMethod.Network, data.size)
}
}
/**
* Return a list of locations for the given block, prioritizing the local machine since
* multiple block managers can share the same host, followed by hosts on the same rack.
*/
private def sortLocations(locations: Seq[BlockManagerId]): Seq[BlockManagerId] = {
val locs = Random.shuffle(locations)
val (preferredLocs, otherLocs) = locs.partition { loc => blockManagerId.host == loc.host }
blockManagerId.topologyInfo match {
case None => preferredLocs ++ otherLocs
case Some(_) =>
val (sameRackLocs, differentRackLocs) = otherLocs.partition {
loc => blockManagerId.topologyInfo == loc.topologyInfo
}
preferredLocs ++ sameRackLocs ++ differentRackLocs
}
}
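sortLocations keeps the fetch order cheap-first: shuffle, then same host, then (when topology info exists) same rack, then everything else. The same logic in a self-contained form (Loc and its rack field are stand-ins for BlockManagerId and its topologyInfo):
import scala.util.Random
object SortLocationsSketch {
  case class Loc(host: String, rack: Option[String])
  def sortLocs(locations: Seq[Loc], myHost: String, myRack: Option[String]): Seq[Loc] = {
    val shuffled = Random.shuffle(locations)                      // randomize within each priority group
    val (sameHost, others) = shuffled.partition(_.host == myHost)
    myRack match {
      case None => sameHost ++ others                             // no topology info: host-local first, then the rest
      case Some(_) =>
        val (sameRack, differentRack) = others.partition(_.rack == myRack)
        sameHost ++ sameRack ++ differentRack
    }
  }
}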
/**
* Get block from remote block managers as serialized bytes.
*/
// first ask the driver's BlockManagerMasterEndpoint which nodes hold this block, then fetch the block data from one of them
def getRemoteBytes(blockId: BlockId): Option[ChunkedByteBuffer] = {
logDebug(s"Getting remote block $blockId")
require(blockId != null, "BlockId is null")
var runningFailureCount = 0
var totalFailureCount = 0
// Because all the remote blocks are registered in driver, it is not necessary to ask
// all the slave executors to get block status.
val locationsAndStatus: Option[BlockManagerMessages.BlockLocationsAndStatus] = master.getLocationsAndStatus(blockId) //从driver BlockManagerMasterEndpoint中 获取这个 block的位置和状态
val blockSize: Long = locationsAndStatus.map { b =>
b.status.diskSize.max(b.status.memSize)
}.getOrElse(0L) //求在磁盘和内存量中的最大的值
val blockLocations: Seq[BlockManagerId] = locationsAndStatus.map(_.locations).getOrElse(Seq.empty) //拿到位置信息
// If the block size is above the threshold, we should pass our FileManger to
// BlockTransferService, which will leverage it to spill the block; if not, then passed-in
// null value means the block will be persisted in memory.
val tempFileManager = if (blockSize > maxRemoteBlockToMem) { // if the block is larger than the threshold, fetch it into a temp file instead of memory
remoteBlockTempFileManager
} else {
null
}
val locations = sortLocations(blockLocations) //排序
val maxFetchFailures = locations.size // allow one failure per known location of this block
var locationIterator = locations.iterator
while (locationIterator.hasNext) {
val loc = locationIterator.next()
logDebug(s"Getting remote block $blockId from $loc")
val data: ByteBuffer = try { //从目标机器获取数据, 使用 NettyBlockTransferService 的 fetchBlocks 方法
blockTransferService.fetchBlockSync(
loc.host, loc.port, loc.executorId, blockId.toString, tempFileManager).nioByteBuffer()
} catch {
case NonFatal(e) =>
runningFailureCount += 1
totalFailureCount += 1
if (totalFailureCount >= maxFetchFailures) {
// Give up trying anymore locations. Either we've tried all of the original locations,
// or we've refreshed the list of locations from the master, and have still
// hit failures after trying locations from the refreshed list.
logWarning(s"Failed to fetch block after $totalFailureCount fetch failures. " +
s"Most recent failure cause:", e)
return None
}
logWarning(s"Failed to fetch remote block $blockId " +
s"from $loc (failed attempt $runningFailureCount)", e)
// If there is a large number of executors then locations list can contain a
// large number of stale entries causing a large number of retries that may
// take a significant amount of time. To get rid of these stale entries
// we refresh the block locations after a certain number of fetch failures
if (runningFailureCount >= maxFailuresBeforeLocationRefresh) {
locationIterator = sortLocations(master.getLocations(blockId)).iterator
logDebug(s"Refreshed locations from the driver " +
s"after ${runningFailureCount} fetch failures.")
runningFailureCount = 0
}
// This location failed, so we retry fetch from a different one by returning null here
null
}
if (data != null) {//返回数据
return Some(new ChunkedByteBuffer(data))
}
logDebug(s"The value of block $blockId is null")
}
logDebug(s"Block $blockId not found")
None
}
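Stripped of the transfer details, getRemoteBytes is a retry loop over candidate locations: it gives up after as many total failures as there were locations, and refreshes the location list from the master after a configurable number of consecutive failures. A compact model of just that loop (all names are illustrative; fetch and refresh stand in for the block transfer and master calls):
object FetchLoopSketch {
  def fetchFirst[A](initial: Seq[String],
                    refresh: () => Seq[String],
                    maxFailuresBeforeRefresh: Int,
                    fetch: String => Option[A]): Option[A] = {
    var running = 0                      // consecutive failures since the last refresh
    var total = 0                        // failures overall
    val maxFetchFailures = initial.size  // one chance per initially known location
    var it = initial.iterator
    while (it.hasNext) {
      fetch(it.next()) match {
        case Some(a) => return Some(a)
        case None =>
          running += 1; total += 1
          if (total >= maxFetchFailures) return None
          if (running >= maxFailuresBeforeRefresh) {
            it = refresh().iterator      // drop stale locations and start over
            running = 0
          }
      }
    }
    None
  }
}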
/**
* Get a block from the block manager (either local or remote).
*
* This acquires a read lock on the block if the block was stored locally and does not acquire
* any locks if the block was fetched from a remote block manager. The read lock will
* automatically be freed once the result's `data` iterator is fully consumed.
*/
//先尝试 从 本地的 blockManager 获取 blockId data,再尝试从 其他节点 的 blockManager 获取 blockId data
def get[T: ClassTag](blockId: BlockId): Option[BlockResult] = {
val local = getLocalValues(blockId)
if (local.isDefined) {
logInfo(s"Found block $blockId locally")
return local
}
val remote = getRemoteValues[T](blockId)
if (remote.isDefined) {
logInfo(s"Found block $blockId remotely")
return remote
}
None
}
/**
* Downgrades an exclusive write lock to a shared read lock.
*/
// 锁降级
def downgradeLock(blockId: BlockId): Unit = {
blockInfoManager.downgradeLock(blockId)
}
/**
* Release a lock on the given block with explicit TID.
* The param `taskAttemptId` should be passed in case we can't get the correct TID from
* TaskContext, for example, the input iterator of a cached RDD iterates to the end in a child
* thread.
*/
// 释放锁
def releaseLock(blockId: BlockId, taskAttemptId: Option[Long] = None): Unit = {
blockInfoManager.unlock(blockId, taskAttemptId)
}
/**
* Registers a task with the BlockManager in order to initialize per-task bookkeeping structures.
*/
//在 BlockInfoManager 中注册 task
def registerTask(taskAttemptId: Long): Unit = {
blockInfoManager.registerTask(taskAttemptId)
}
/**
* Release all locks for the given task.
*
* @return the blocks whose locks were released.
*/
// 释放这个 task 的所有的 BlockId
def releaseAllLocksForTask(taskAttemptId: Long): Seq[BlockId] = {
blockInfoManager.releaseAllLocksForTask(taskAttemptId)
}
/**
* Retrieve the given block if it exists, otherwise call the provided `makeIterator` method
* to compute the block, persist it, and return its values.
*
* @return either a BlockResult if the block was successfully cached, or an iterator if the block
* could not be cached.
*/
// first try to get the block locally or remotely; otherwise compute it, cache it, and return its values
def getOrElseUpdate[T](
blockId: BlockId,
level: StorageLevel,
classTag: ClassTag[T],
makeIterator: () => Iterator[T]): Either[BlockResult, Iterator[T]] = {
// Attempt to read the block from local or remote storage. If it's present, then we don't need
// to go through the local-get-or-put path.
get[T](blockId)(classTag) match {
case Some(block) =>
return Left(block)
case _ =>
// Need to compute the block.
}
// Initially we hold no locks on this block.
doPutIterator(blockId, makeIterator, level, classTag, keepReadLock = true) match {
case None =>
// doPut() didn't hand work back to us, so the block already existed or was successfully
// stored. Therefore, we now hold a read lock on the block.
val blockResult = getLocalValues(blockId).getOrElse {
// Since we held a read lock between the doPut() and get() calls, the block should not
// have been evicted, so get() not returning the block indicates some internal error.
releaseLock(blockId)
throw new SparkException(s"get() failed for block $blockId even though we held a lock")
}
// We already hold a read lock on the block from the doPut() call and getLocalValues()
// acquires the lock again, so we need to call releaseLock() here so that the net number
// of lock acquisitions is 1 (since the caller will only call release() once).
releaseLock(blockId)
Left(blockResult)
case Some(iter) =>
// The put failed, likely because the data was too large to fit in memory and could not be
// dropped to disk. Therefore, we need to pass the input iterator back to the caller so
// that they can decide what to do with the values (e.g. process them without caching).
Right(iter)
}
}
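The Either returned by getOrElseUpdate encodes the two outcomes a caller must handle: Left when the block is (now) cached and readable, Right when caching failed and the freshly computed values must be consumed directly. A much-simplified cache-or-compute sketch of that contract (TinyCache is illustrative and does no locking):
import scala.collection.mutable
class TinyCache[K, V] {
  private val store = mutable.Map.empty[K, Seq[V]]
  def getOrElseUpdate(key: K, make: () => Iterator[V], canStore: Seq[V] => Boolean): Either[Seq[V], Iterator[V]] =
    store.get(key) match {
      case Some(cached) => Left(cached)                                   // already cached
      case None =>
        val computed = make().toSeq
        if (canStore(computed)) { store(key) = computed; Left(computed) } // cached now
        else Right(computed.iterator)                                     // could not cache: hand the values back
    }
}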
/**
* @return true if the block was stored or false if an error occurred.
*/
//缓存这个 blockId
def putIterator[T: ClassTag](
blockId: BlockId,
values: Iterator[T],
level: StorageLevel,
tellMaster: Boolean = true): Boolean = {
require(values != null, "Values is null")
doPutIterator(blockId, () => values, level, implicitly[ClassTag[T]], tellMaster) match {
case None =>
true
case Some(iter) =>
// Caller doesn't care about the iterator values, so we can close the iterator here
// to free resources earlier
iter.close()
false
}
}
/**
* A short circuited method to get a block writer that can write data directly to disk.
* The Block will be appended to the File specified by filename. Callers should handle error
* cases.
*/
// get a DiskBlockObjectWriter that appends directly to the given file
def getDiskWriter(
blockId: BlockId,
file: File,
serializerInstance: SerializerInstance,
bufferSize: Int,
writeMetrics: ShuffleWriteMetrics): DiskBlockObjectWriter = {
val syncWrites = conf.getBoolean("spark.shuffle.sync", false)
new DiskBlockObjectWriter(file, serializerManager, serializerInstance, bufferSize,
syncWrites, writeMetrics, blockId)
}
/**
* Put a new block of serialized bytes to the block manager.
*
* '''Important!''' Callers must not mutate or release the data buffer underlying `bytes`. Doing
* so may corrupt or change the data stored by the `BlockManager`.
*
* @return true if the block was stored or false if an error occurred.
*/
// 写 block data 数据
def putBytes[T: ClassTag](
blockId: BlockId,
bytes: ChunkedByteBuffer,
level: StorageLevel,
tellMaster: Boolean = true): Boolean = {
require(bytes != null, "Bytes is null")
doPutBytes(blockId, bytes, level, implicitly[ClassTag[T]], tellMaster)
}
/**
* Put the given bytes according to the given level in one of the block stores, replicating
* the values if necessary.
*
* If the block already exists, this method will not overwrite it.
*
* '''Important!''' Callers must not mutate or release the data buffer underlying `bytes`. Doing
* so may corrupt or change the data stored by the `BlockManager`.
*
* @param keepReadLock if true, this method will hold the read lock when it returns (even if the
* block already exists). If false, this method will hold no locks when it
* returns.
* @return true if the block was already present or if the put succeeded, false otherwise.
*/
// 写 block data 数据
private def doPutBytes[T](
blockId: BlockId,
bytes: ChunkedByteBuffer,
level: StorageLevel,
classTag: ClassTag[T],
tellMaster: Boolean = true,
keepReadLock: Boolean = false): Boolean = { // keepReadLock: whether the read lock should still be held when this method returns
doPut(blockId, level, classTag, tellMaster = tellMaster, keepReadLock = keepReadLock) { info => // doPut registers the BlockInfo (acquiring the write lock) and then runs this body to store the bytes; `info` is that block's lock/metadata record
val startTimeMs = System.currentTimeMillis
// Since we're storing bytes, initiate the replication before storing them locally.
// This is faster as data is already serialized and ready to send.
val replicationFuture = if (level.replication > 1) { //副本数
Future {
// This is a blocking action and should run in futureExecutionContext which is a cached
// thread pool. The ByteBufferBlockData wrapper is not disposed of to avoid releasing
// buffers that are owned by the caller.
replicate(blockId, new ByteBufferBlockData(bytes, false), level, classTag)
}(futureExecutionContext)
} else {
null // 以 副本数 为 1 为例
}
val size = bytes.size //block data 的容量大小size
if (level.useMemory) { // 保存到 内存
// Put it in memory first, even if it also has useDisk set to true;
// We will drop it to disk later if the memory store can't hold it.
val putSucceeded = if (level.deserialized) { // 保存 非序列化的数据
val values =
serializerManager.dataDeserializeStream(blockId, bytes.toInputStream())(classTag)
memoryStore.putIteratorAsValues(blockId, values, classTag) match {
case Right(_) => true
case Left(iter) =>
// If putting deserialized values in memory failed, we will put the bytes directly to
// disk, so we don't need this iterator and can close it to free resources earlier.
iter.close()
false
}
} else {// 保存 序列化的数据
val memoryMode = level.memoryMode
memoryStore.putBytes(blockId, size, memoryMode, () => {
if (memoryMode == MemoryMode.OFF_HEAP &&
bytes.chunks.exists(buffer => !buffer.isDirect)) {
bytes.copy(Platform.allocateDirectBuffer)
} else {
bytes
}
})
}
if (!putSucceeded && level.useDisk) { //走到这里的话,是由于内存 保存不了了,然后这个 Storage level 也允许在 磁盘保存数据
logWarning(s"Persisting block $blockId to disk instead.")
diskStore.putBytes(blockId, bytes) // 保存 block data 在磁盘上
}
} else if (level.useDisk) { // 保存到 磁盘
diskStore.putBytes(blockId, bytes) // 保存 block data 在磁盘上
}
val putBlockStatus: BlockStatus = getCurrentBlockStatus(blockId, info)
val blockWasSuccessfullyStored: Boolean = putBlockStatus.storageLevel.isValid
if (blockWasSuccessfullyStored) { // 这里的话 block 已经保存到 内存或者 磁盘了
// Now that the block is in either the memory or disk store,
// tell the master about it.
info.size = size
if (tellMaster && info.tellMaster) { //是否需要 tell blockManager master endPoint
reportBlockStatus(blockId, putBlockStatus) //报告 Block 状态 如果driver 以前没有注册这个 blockManagerId,也会 异步 向 driver 注册 BlockManager 且 报告 本 节点 的 blockInfoManager 中的所有的 block
}
addUpdatedBlockStatusToTaskMetrics(blockId, putBlockStatus)
}
logDebug("Put block %s locally took %s".format(blockId, Utils.getUsedTimeMs(startTimeMs)))
if (level.replication > 1) {
// Wait for asynchronous replication to finish
try {
ThreadUtils.awaitReady(replicationFuture, Duration.Inf)
} catch {
case NonFatal(t) =>
throw new Exception("Error occurred while waiting for replication to finish", t)
}
}
if (blockWasSuccessfullyStored) { // None signals success; Some(bytes) hands the data back to the caller on failure
None
} else {
Some(bytes)
}
}.isEmpty
}
/**
* Helper method used to abstract common code from [[doPutBytes()]] and [[doPutIterator()]].
*
* @param putBody a function which attempts the actual put() and returns None on success
* or Some on failure.
*/
//写 block data ,里面 会更新 BlockInfoManager的BlockInfo 的读写锁信息
private def doPut[T](
blockId: BlockId,
level: StorageLevel,
classTag: ClassTag[_],
tellMaster: Boolean,
keepReadLock: Boolean)(putBody: BlockInfo => Option[T]): Option[T] = { //keepReadLock 是否继续持有读锁
require(blockId != null, "BlockId is null")
require(level != null && level.isValid, "StorageLevel is null or invalid") //(useMemory || useDisk) && (replication > 0)
// 拿到 BlockInfo 这个block 的 读写锁信息
val putBlockInfo: BlockInfo = {
val newInfo = new BlockInfo(level, classTag, tellMaster) //读写锁 次数统计
if (blockInfoManager.lockNewBlockForWriting(blockId, newInfo)) { // true: the block did not exist yet and we now hold its write lock
newInfo
} else { // false: the block already exists, and lockNewBlockForWriting acquired a read lock on the existing block instead
logWarning(s"Block $blockId already exists on this machine; not re-adding it")
if (!keepReadLock) { // the caller does not want to keep that read lock, so release it
// lockNewBlockForWriting returned a read lock on the existing block, so we must free it:
releaseLock(blockId)
}
return None
}
}
val startTimeMs = System.currentTimeMillis
var exceptionWasThrown: Boolean = true
val result: Option[T] = try {
val res = putBody(putBlockInfo)
exceptionWasThrown = false
if (res.isEmpty) { // None means the put body stored the whole block
// the block was successfully stored
if (keepReadLock) { // we currently hold the write lock; downgrade it to a read lock for the caller
blockInfoManager.downgradeLock(blockId)
} else { // otherwise release the lock we hold
blockInfoManager.unlock(blockId)
}
} else { // the put body could not store the block
removeBlockInternal(blockId, tellMaster = false) // drop the half-written block
logWarning(s"Putting block $blockId failed")
}
res
} catch {
// Since removeBlockInternal may throw exception,
// we should print exception first to show root cause.
case NonFatal(e) =>
logWarning(s"Putting block $blockId failed due to exception $e.")
throw e
} finally {
// This cleanup is performed in a finally block rather than a `catch` to avoid having to
// catch and properly re-throw InterruptedException.
if (exceptionWasThrown) {
// If an exception was thrown then it's possible that the code in `putBody` has already
// notified the master about the availability of this block, so we need to send an update
// to remove this block location.
removeBlockInternal(blockId, tellMaster = tellMaster) // 移除这个 blockId
// The `putBody` code may have also added a new block status to TaskMetrics, so we need
// to cancel that out by overwriting it with an empty block status. We only do this if
// the finally block was entered via an exception because doing this unconditionally would
// cause us to send empty block statuses for every block that failed to be cached due to
// a memory shortage (which is an expected failure, unlike an uncaught exception).
addUpdatedBlockStatusToTaskMetrics(blockId, BlockStatus.empty)
}
}
if (level.replication > 1) {
logDebug("Putting block %s with replication took %s"
.format(blockId, Utils.getUsedTimeMs(startTimeMs)))
} else {
logDebug("Putting block %s without replication took %s"
.format(blockId, Utils.getUsedTimeMs(startTimeMs)))
}
result
}
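Reduced to its control flow, doPut is a template around the lock lifecycle: acquire the write lock by registering the new BlockInfo, run the put body, then downgrade or release the lock on success, and remove the half-written block on failure or exception. A self-contained sketch of that shape, with plain callbacks standing in for the BlockInfoManager calls (names are illustrative):
object PutTemplateSketch {
  def putTemplate[T](acquireWriteLock: () => Boolean,
                     body: () => Option[T],
                     downgrade: () => Unit,
                     unlock: () => Unit,
                     removeBlock: () => Unit,
                     keepReadLock: Boolean): Option[T] = {
    if (!acquireWriteLock()) return None   // the block already exists; nothing to do here
    var threw = true
    try {
      val res = body()
      threw = false
      if (res.isEmpty) {                   // None signals the body stored the block
        if (keepReadLock) downgrade() else unlock()
      } else {
        removeBlock()                      // the body handed data back: the put failed
      }
      res
    } finally {
      if (threw) removeBlock()             // exception: drop whatever was half-written
    }
  }
}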
/**
* Put the given block according to the given level in one of the block stores, replicating
* the values if necessary.
*
* If the block already exists, this method will not overwrite it.
*
* @param keepReadLock if true, this method will hold the read lock when it returns (even if the
* block already exists). If false, this method will hold no locks when it
* returns.
* @return None if the block was already present or if the put succeeded, or Some(iterator)
* if the put failed.
*/
//写 block data ,里面 会更新 BlockInfoManager的BlockInfo 的读写锁信息
private def doPutIterator[T](
blockId: BlockId,
iterator: () => Iterator[T],
level: StorageLevel,
classTag: ClassTag[T],
tellMaster: Boolean = true,
keepReadLock: Boolean = false): Option[PartiallyUnrolledIterator[T]] = {
doPut(blockId, level, classTag, tellMaster = tellMaster, keepReadLock = keepReadLock) { info =>
val startTimeMs = System.currentTimeMillis
var iteratorFromFailedMemoryStorePut: Option[PartiallyUnrolledIterator[T]] = None
// Size of the block in bytes
var size = 0L
if (level.useMemory) {
// Put it in memory first, even if it also has useDisk set to true;
// We will drop it to disk later if the memory store can't hold it.
if (level.deserialized) {
memoryStore.putIteratorAsValues(blockId, iterator(), classTag) match {
case Right(s) =>
size = s
case Left(iter) =>
// Not enough space to unroll this block; drop to disk if applicable
if (level.useDisk) {
logWarning(s"Persisting block $blockId to disk instead.")
diskStore.put(blockId) { channel =>
val out = Channels.newOutputStream(channel)
serializerManager.dataSerializeStream(blockId, out, iter)(classTag)
}
size = diskStore.getSize(blockId)
} else {
iteratorFromFailedMemoryStorePut = Some(iter)
}
}
} else { // !level.deserialized
memoryStore.putIteratorAsBytes(blockId, iterator(), classTag, level.memoryMode) match {
case Right(s) =>
size = s
case Left(partiallySerializedValues) =>
// Not enough space to unroll this block; drop to disk if applicable
if (level.useDisk) {
logWarning(s"Persisting block $blockId to disk instead.")
diskStore.put(blockId) { channel =>
val out = Channels.newOutputStream(channel)
partiallySerializedValues.finishWritingToStream(out)
}
size = diskStore.getSize(blockId)
} else {
iteratorFromFailedMemoryStorePut = Some(partiallySerializedValues.valuesIterator)
}
}
}
} else if (level.useDisk) {
diskStore.put(blockId) { channel =>
val out = Channels.newOutputStream(channel)
serializerManager.dataSerializeStream(blockId, out, iterator())(classTag)
}
size = diskStore.getSize(blockId)
}
val putBlockStatus = getCurrentBlockStatus(blockId, info)
val blockWasSuccessfullyStored = putBlockStatus.storageLevel.isValid
if (blockWasSuccessfullyStored) {
// Now that the block is in either the memory or disk store, tell the master about it.
info.size = size
if (tellMaster && info.tellMaster) {
reportBlockStatus(blockId, putBlockStatus)
}
addUpdatedBlockStatusToTaskMetrics(blockId, putBlockStatus)
logDebug("Put block %s locally took %s".format(blockId, Utils.getUsedTimeMs(startTimeMs)))
if (level.replication > 1) {
val remoteStartTime = System.currentTimeMillis
val bytesToReplicate = doGetLocalBytes(blockId, info)
// [SPARK-16550] Erase the typed classTag when using default serialization, since
// NettyBlockRpcServer crashes when deserializing repl-defined classes.
// TODO(ekl) remove this once the classloader issue on the remote end is fixed.
val remoteClassTag = if (!serializerManager.canUseKryo(classTag)) {
scala.reflect.classTag[Any]
} else {
classTag
}
try {
replicate(blockId, bytesToReplicate, level, remoteClassTag)
} finally {
bytesToReplicate.dispose()
}
logDebug("Put block %s remotely took %s"
.format(blockId, Utils.getUsedTimeMs(remoteStartTime)))
}
}
assert(blockWasSuccessfullyStored == iteratorFromFailedMemoryStorePut.isEmpty)
iteratorFromFailedMemoryStorePut
}
}
/**
* Attempts to cache spilled bytes read from disk into the MemoryStore in order to speed up
* subsequent reads. This method requires the caller to hold a read lock on the block.
*
* @return a copy of the bytes from the memory store if the put succeeded, otherwise None.
* If this returns bytes from the memory store then the original disk store bytes will
* automatically be disposed and the caller should not continue to use them. Otherwise,
* if this returns None then the original disk store bytes will be unaffected.
*/
//可能的话 在内存中 缓存 这个block data
private def maybeCacheDiskBytesInMemory(
blockInfo: BlockInfo,
blockId: BlockId,
level: StorageLevel,
diskData: BlockData): Option[ChunkedByteBuffer] = {
require(!level.deserialized)
if (level.useMemory) {//可能的话 在内存中 缓存 这个block data
// Synchronize on blockInfo to guard against a race condition where two readers both try to
// put values read from disk into the MemoryStore.
blockInfo.synchronized {
if (memoryStore.contains(blockId)) {
diskData.dispose()
Some(memoryStore.getBytes(blockId).get)
} else {
val allocator = level.memoryMode match {
case MemoryMode.ON_HEAP => ByteBuffer.allocate _
case MemoryMode.OFF_HEAP => Platform.allocateDirectBuffer _
}
val putSucceeded = memoryStore.putBytes(blockId, diskData.size, level.memoryMode, () => {
// https://issues.apache.org/jira/browse/SPARK-6076
// If the file size is bigger than the free memory, OOM will happen. So if we
// cannot put it into MemoryStore, copyForMemory should not be created. That's why
// this action is put into a `() => ChunkedByteBuffer` and created lazily.
diskData.toChunkedByteBuffer(allocator)
})
if (putSucceeded) {
diskData.dispose()
Some(memoryStore.getBytes(blockId).get)
} else {
None
}
}
}
} else {
None
}
}
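The synchronized block above is a double-checked promotion: the memory store is re-checked while holding the block's own monitor, so two readers racing on the same block do not both copy the disk bytes into memory. The same shape in miniature (the callbacks below are stand-ins for the MemoryStore calls, not Spark API):
class DiskToMemoryPromotion[V](blockMonitor: AnyRef,
                               readFromMemory: () => Option[V],
                               putIntoMemory: () => Boolean) {
  def promote(): Option[V] = blockMonitor.synchronized {
    readFromMemory() match {
      case some @ Some(_) => some                 // another reader cached it first
      case None =>
        if (putIntoMemory()) readFromMemory()     // cached now: read our own in-memory copy back
        else None                                 // not enough memory: keep using the disk bytes
    }
  }
}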
/**
* Attempts to cache spilled values read from disk into the MemoryStore in order to speed up
* subsequent reads. This method requires the caller to hold a read lock on the block.
*
* @return a copy of the iterator. The original iterator passed this method should no longer
* be used after this method returns.
*/
// 尽量 缓存 磁盘的 数据 到 内存
private def maybeCacheDiskValuesInMemory[T](
blockInfo: BlockInfo,
blockId: BlockId,
level: StorageLevel,
diskIterator: Iterator[T]): Iterator[T] = {
require(level.deserialized)
val classTag = blockInfo.classTag.asInstanceOf[ClassTag[T]]
if (level.useMemory) {
// Synchronize on blockInfo to guard against a race condition where two readers both try to
// put values read from disk into the MemoryStore.
blockInfo.synchronized {
if (memoryStore.contains(blockId)) {
// Note: if we had a means to discard the disk iterator, we would do that here.
memoryStore.getValues(blockId).get
} else {
memoryStore.putIteratorAsValues(blockId, diskIterator, classTag) match {
case Left(iter) =>
// The memory store put() failed, so it returned the iterator back to us:
iter
case Right(_) =>
// The put() succeeded, so we can read the values back:
memoryStore.getValues(blockId).get
}
}
}.asInstanceOf[Iterator[T]]
} else {
diskIterator
}
}
/**
* Get peer block managers in the system.
*/
// fetch (with a TTL cache) the BlockManagerIds of all other block managers, excluding the driver and this block manager itself
private def getPeers(forceFetch: Boolean): Seq[BlockManagerId] = {
peerFetchLock.synchronized {
val cachedPeersTtl = conf.getInt("spark.storage.cachedPeersTtl", 60 * 1000) // milliseconds
val timeout = System.currentTimeMillis - lastPeerFetchTime > cachedPeersTtl
if (cachedPeers == null || forceFetch || timeout) {
cachedPeers = master.getPeers(blockManagerId).sortBy(_.hashCode) //取得 全局 blockManagerInfo 中的 不在 driver上 和 本 blockManagerId 的其他的 BlockManager 信息
lastPeerFetchTime = System.currentTimeMillis
logDebug("Fetched peers from master: " + cachedPeers.mkString("[", ",", "]"))
}
cachedPeers
}
}
/**
* Called for pro-active replenishment of blocks lost due to executor failures
*
* @param blockId blockId being replicate
* @param existingReplicas existing block managers that have a replica
* @param maxReplicas maximum replicas needed
*/
// 复制 一份BlockData 到 其他的节点,非 driver
def replicateBlock(
blockId: BlockId,
existingReplicas: Set[BlockManagerId], //已经在 那些个 BlockManagerId 上缓存了,就不会再 这些上面 缓存了
maxReplicas: Int): Unit = {
logInfo(s"Using $blockManagerId to pro-actively replicate $blockId")
blockInfoManager.lockForReading(blockId).foreach { info =>
val data = doGetLocalBytes(blockId, info)
val storageLevel = StorageLevel(
useDisk = info.level.useDisk,
useMemory = info.level.useMemory,
useOffHeap = info.level.useOffHeap,
deserialized = info.level.deserialized,
replication = maxReplicas)
// we know we are called as a result of an executor removal, so we refresh peer cache
// this way, we won't try to replicate to a missing executor with a stale reference
getPeers(forceFetch = true)
try {
replicate(blockId, data, storageLevel, info.classTag, existingReplicas)
} finally {
logDebug(s"Releasing lock for $blockId")
releaseLockAndDispose(blockId, data)
}
}
}
/**
* Replicate block to another node. Note that this is a blocking call that returns after
* the block has been replicated.
*/
// 复制 一份BlockData 到 其他的节点,非 driver, 使用 NettyBlockTransferService 的 uploadBlock 功能
private def replicate(
blockId: BlockId,
data: BlockData,
level: StorageLevel,
classTag: ClassTag[_],
existingReplicas: Set[BlockManagerId] = Set.empty): Unit = { //已经在 那些个 BlockManagerId 上缓存了,就不会再 这些上面 缓存了
val maxReplicationFailures = conf.getInt("spark.storage.maxReplicationFailures", 1)
val tLevel: StorageLevel = StorageLevel(
useDisk = level.useDisk,
useMemory = level.useMemory,
useOffHeap = level.useOffHeap,
deserialized = level.deserialized,
replication = 1)
val numPeersToReplicateTo = level.replication - 1
val startTime = System.nanoTime
val peersReplicatedTo = mutable.HashSet.empty ++ existingReplicas
val peersFailedToReplicateTo = mutable.HashSet.empty[BlockManagerId]
var numFailures = 0
val initialPeers = getPeers(false).filterNot(existingReplicas.contains) //这里过滤掉 已经缓存过的节点
var peersForReplication = blockReplicationPolicy.prioritize(
blockManagerId,
initialPeers,
peersReplicatedTo,
blockId,
numPeersToReplicateTo)
while(numFailures <= maxReplicationFailures &&
!peersForReplication.isEmpty &&
peersReplicatedTo.size < numPeersToReplicateTo) {
val peer = peersForReplication.head
try {
val onePeerStartTime = System.nanoTime
logTrace(s"Trying to replicate $blockId of ${data.size} bytes to $peer")
blockTransferService.uploadBlockSync(
peer.host,
peer.port,
peer.executorId,
blockId,
new BlockManagerManagedBuffer(blockInfoManager, blockId, data, false),
tLevel,
classTag) // upload the block to the peer via blockTransferService
logTrace(s"Replicated $blockId of ${data.size} bytes to $peer" +
s" in ${(System.nanoTime - onePeerStartTime).toDouble / 1e6} ms")
peersForReplication = peersForReplication.tail
peersReplicatedTo += peer
} catch {
case NonFatal(e) =>
logWarning(s"Failed to replicate $blockId to $peer, failure #$numFailures", e)
peersFailedToReplicateTo += peer
// we have a failed replication, so we get the list of peers again
// we don't want peers we have already replicated to and the ones that
// have failed previously
val filteredPeers = getPeers(true).filter { p =>
!peersFailedToReplicateTo.contains(p) && !peersReplicatedTo.contains(p)
}
numFailures += 1
peersForReplication = blockReplicationPolicy.prioritize(
blockManagerId,
filteredPeers,
peersReplicatedTo,
blockId,
numPeersToReplicateTo - peersReplicatedTo.size)
}
}
logDebug(s"Replicating $blockId of ${data.size} bytes to " +
s"${peersReplicatedTo.size} peer(s) took ${(System.nanoTime - startTime) / 1e6} ms")
if (peersReplicatedTo.size < numPeersToReplicateTo) {
logWarning(s"Block $blockId replicated to only " +
s"${peersReplicatedTo.size} peer(s) instead of $numPeersToReplicateTo peers")
}
logDebug(s"block $blockId replicated to ${peersReplicatedTo.mkString(", ")}")
}
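The bookkeeping of the replication loop boils down to three collections: peers already replicated to, peers that failed, and the current candidate list, which is rebuilt from a fresh peer snapshot after every failure. A reduced sketch with the prioritization policy and the actual transfer stubbed out (peers and upload are hypothetical callbacks):
object ReplicationLoopSketch {
  def replicateTo(targetCount: Int,
                  maxFailures: Int,
                  peers: () => Seq[String],
                  upload: String => Boolean): Set[String] = {
    var replicated = Set.empty[String]
    var failed = Set.empty[String]
    var failures = 0
    var candidates = peers().filterNot(replicated)
    while (failures <= maxFailures && candidates.nonEmpty && replicated.size < targetCount) {
      val peer = candidates.head
      if (upload(peer)) {
        replicated += peer
        candidates = candidates.tail
      } else {
        failed += peer
        failures += 1
        candidates = peers().filterNot(p => replicated(p) || failed(p)) // refresh and drop bad peers
      }
    }
    replicated
  }
}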
/**
* Read a block consisting of a single object.
*/
//获取 block data
def getSingle[T: ClassTag](blockId: BlockId): Option[T] = {
get[T](blockId).map(_.data.next().asInstanceOf[T])
}
/**
* Write a block consisting of a single object.
*
* @return true if the block was stored or false if the block was already stored or an
* error occurred.
*/
// put 一个 对象
def putSingle[T: ClassTag](
blockId: BlockId,
value: T,
level: StorageLevel,
tellMaster: Boolean = true): Boolean = {
putIterator(blockId, Iterator(value), level, tellMaster)
}
/**
* Drop a block from memory, possibly putting it on disk if applicable. Called when the memory
* store reaches its limit and needs to free up space.
*
* If `data` is not put on disk, it won't be created.
*
* The caller of this method must hold a write lock on the block before calling this method.
* This method does not release the write lock.
*
* @return the block's new effective StorageLevel.
*/
// drop the block from memory, first writing it to disk if the storage level allows
private[storage] override def dropFromMemory[T: ClassTag](
blockId: BlockId,
data: () => Either[Array[T], ChunkedByteBuffer]): StorageLevel = {
logInfo(s"Dropping block $blockId from memory")
val info: BlockInfo = blockInfoManager.assertBlockIsLockedForWriting(blockId)
var blockIsUpdated = false
val level = info.level //StorageLevel
// Drop to disk, if storage level requires
if (level.useDisk && !diskStore.contains(blockId)) { //如果使用的是 磁盘,且 磁盘中 没有 保存过,
logInfo(s"Writing block $blockId to disk")
data() match {
case Left(elements) => // deserialized objects: serialize them while streaming to disk
diskStore.put(blockId) { channel =>
val out = Channels.newOutputStream(channel)
serializerManager.dataSerializeStream(
blockId,
out,
elements.toIterator)(info.classTag.asInstanceOf[ClassTag[T]])
}
case Right(bytes) => // already-serialized bytes: write them to disk directly
diskStore.putBytes(blockId, bytes)
}
blockIsUpdated = true
}
// Actually drop from memory store
val droppedMemorySize =
if (memoryStore.contains(blockId)) memoryStore.getSize(blockId) else 0L
val blockIsRemoved = memoryStore.remove(blockId) //如果是 内存缓存 则移除
if (blockIsRemoved) {
blockIsUpdated = true
} else {
logWarning(s"Block $blockId could not be dropped from memory as it does not exist")
}
val status: BlockStatus = getCurrentBlockStatus(blockId, info) // possibly BlockStatus.empty; records where the block now lives and its in-memory/on-disk sizes
if (info.tellMaster) {
//报告 Block 状态 如果driver 以前没有注册这个 blockManagerId,也会 异步 向 driver 注册 BlockManager 且 报告 本 节点 的 blockInfoManager 中的所有的 block
reportBlockStatus(blockId, status, droppedMemorySize)
}
if (blockIsUpdated) {
addUpdatedBlockStatusToTaskMetrics(blockId, status)
}
status.storageLevel
}
/**
* Remove all blocks belonging to the given RDD.
*
* @return The number of blocks removed.
*/
//移除 这个RDD 的所有的 blocks
def removeRdd(rddId: Int): Int = {
// TODO: Avoid a linear scan by creating another mapping of RDD.id to blocks.
logInfo(s"Removing RDD $rddId")
val blocksToRemove: Iterator[RDDBlockId] = blockInfoManager.entries.flatMap(_._1.asRDDId).filter(_.rddId == rddId) //过滤得到这个 rddid 的所有的 blockid
blocksToRemove.foreach { blockId => removeBlock(blockId, tellMaster = false) } //依次移除 这些 blockid
blocksToRemove.size
}
/**
* Remove all blocks belonging to the given broadcast.
*/
//移除一个 广播,即移除属于这个广播的所有的blocks
def removeBroadcast(broadcastId: Long, tellMaster: Boolean): Int = {
logDebug(s"Removing broadcast $broadcastId")
val blocksToRemove: Iterator[BroadcastBlockId] = blockInfoManager.entries.map(_._1).collect { // collect every BroadcastBlockId in blockInfoManager that belongs to this broadcast
// a broadcast usually consists of several blocks (its pieces); the backticked `broadcastId` matches against the existing value rather than binding a new variable
case bid @ BroadcastBlockId(`broadcastId`, _) => bid
}
blocksToRemove.foreach { blockId => removeBlock(blockId, tellMaster) } //依次 移除他们
blocksToRemove.size
}
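The backticks around `broadcastId` in the pattern above are what make the filtering work: they match against the existing value instead of binding a new variable, so collect keeps only the block ids of this particular broadcast. In isolation (DemoBroadcastBlockId is a stand-in for Spark's BroadcastBlockId):
object BacktickPatternSketch {
  case class DemoBroadcastBlockId(broadcastId: Long, field: String)
  def blocksOf(broadcastId: Long, all: Seq[DemoBroadcastBlockId]): Seq[DemoBroadcastBlockId] =
    all.collect { case bid @ DemoBroadcastBlockId(`broadcastId`, _) => bid }
  def main(args: Array[String]): Unit = {
    val all = Seq(DemoBroadcastBlockId(2, ""), DemoBroadcastBlockId(2, "piece0"), DemoBroadcastBlockId(3, ""))
    println(blocksOf(2L, all)) // keeps only the two ids whose broadcastId is 2
  }
}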
/**
* Remove a block from both memory and disk.
*/
//移除一个 block 从内存和 磁盘
def removeBlock(blockId: BlockId, tellMaster: Boolean = true): Unit = {
logDebug(s"Removing block $blockId")
blockInfoManager.lockForWriting(blockId) match { // block until the write lock on this BlockInfo is acquired; None means the block no longer exists
case None => // nothing to remove, just warn
// The block has already been removed; do nothing.
logWarning(s"Asked to remove block $blockId, which does not exist")
case Some(info) => // remove the block from memory, disk and the blockInfoManager
removeBlockInternal(blockId, tellMaster = tellMaster && info.tellMaster)
addUpdatedBlockStatusToTaskMetrics(blockId, BlockStatus.empty)
}
}
/**
* Internal version of [[removeBlock()]] which assumes that the caller already holds a write
* lock on the block.
*/
// 移除这个 blockId 从 内存、磁盘、blockInfoManager
private def removeBlockInternal(blockId: BlockId, tellMaster: Boolean): Unit = {
// Removals are idempotent in disk store and memory store. At worst, we get a warning.
val removedFromMemory = memoryStore.remove(blockId)
val removedFromDisk = diskStore.remove(blockId)
if (!removedFromMemory && !removedFromDisk) {
logWarning(s"Block $blockId could not be removed as it was not found on disk or in memory")
}
blockInfoManager.removeBlock(blockId)
if (tellMaster) {//报告 Block 状态 如果driver 以前没有注册这个 blockManagerId,也会 异步 向 driver 注册 BlockManager 且 报告 本 节点 的 blockInfoManager 中的所有的 block
reportBlockStatus(blockId, BlockStatus.empty)
}
}
private def addUpdatedBlockStatusToTaskMetrics(blockId: BlockId, status: BlockStatus): Unit = {
if (conf.get(config.TASK_METRICS_TRACK_UPDATED_BLOCK_STATUSES)) {
Option(TaskContext.get()).foreach { c =>
c.taskMetrics().incUpdatedBlockStatuses(blockId -> status)
}
}
}
//释放锁 和销毁数据
def releaseLockAndDispose(
blockId: BlockId,
data: BlockData,
taskAttemptId: Option[Long] = None): Unit = {
releaseLock(blockId, taskAttemptId)
data.dispose()
}
def stop(): Unit = {
blockTransferService.close()
if (shuffleClient ne blockTransferService) {
// Closing should be idempotent, but maybe not for the NioBlockTransferService.
shuffleClient.close()
}
remoteBlockTempFileManager.stop()
diskBlockManager.stop()
rpcEnv.stop(slaveEndpoint)
blockInfoManager.clear()
memoryStore.clear()
futureExecutionContext.shutdownNow()
logInfo("BlockManager stopped")
}
}
class BlockManagerSlaveEndpoint
This endpoint exists so that the driver (and other nodes) can ask this node to operate on its blocks. It holds the local BlockManager and uses it to carry out the requested operations: removing blocks, RDDs, shuffles and broadcasts, querying block status, and triggering replication.
// responds to requests (from the driver and others) to operate on this node's blocks
private[storage]
class BlockManagerSlaveEndpoint(
override val rpcEnv: RpcEnv,
blockManager: BlockManager,
mapOutputTracker: MapOutputTracker)
extends ThreadSafeRpcEndpoint with Logging {// 这是一个 endPoint
private val asyncThreadPool =
ThreadUtils.newDaemonCachedThreadPool("block-manager-slave-async-thread-pool")
private implicit val asyncExecutionContext = ExecutionContext.fromExecutorService(asyncThreadPool)
// Operations that involve removing blocks may be slow and should be done asynchronously
override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
case RemoveBlock(blockId) =>
doAsync[Boolean]("removing block " + blockId, context) {
blockManager.removeBlock(blockId)
true
}
case RemoveRdd(rddId) =>
doAsync[Int]("removing RDD " + rddId, context) {
blockManager.removeRdd(rddId)
}
case RemoveShuffle(shuffleId) =>
doAsync[Boolean]("removing shuffle " + shuffleId, context) {
if (mapOutputTracker != null) {
mapOutputTracker.unregisterShuffle(shuffleId)
}
SparkEnv.get.shuffleManager.unregisterShuffle(shuffleId)
}
case RemoveBroadcast(broadcastId, _) =>
doAsync[Int]("removing broadcast " + broadcastId, context) {
blockManager.removeBroadcast(broadcastId, tellMaster = true)
}
case GetBlockStatus(blockId, _) =>
context.reply(blockManager.getStatus(blockId))
case GetMatchingBlockIds(filter, _) =>
context.reply(blockManager.getMatchingBlockIds(filter))
case TriggerThreadDump =>
context.reply(Utils.getThreadDump())
case ReplicateBlock(blockId, replicas, maxReplicas) =>
context.reply(blockManager.replicateBlock(blockId, replicas.toSet, maxReplicas))
}
private def doAsync[T](actionMessage: String, context: RpcCallContext)(body: => T) {
val future = Future {
logDebug(actionMessage)
body
}
future.foreach { response =>
logDebug(s"Done $actionMessage, response is $response")
context.reply(response)
logDebug(s"Sent response: $response to ${context.senderAddress}")
}
future.failed.foreach { t =>
logError(s"Error in $actionMessage", t)
context.sendFailure(t)
}
}
override def onStop(): Unit = {
asyncThreadPool.shutdownNow()
}
}
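doAsync above is the standard future-and-reply shape for slow RPC handlers: run the body on a dedicated pool and answer from the Future's callbacks. A self-contained sketch with plain scala.concurrent, where reply and fail are hypothetical stand-ins for RpcCallContext.reply and sendFailure:
import java.util.concurrent.Executors
import scala.concurrent.{ExecutionContext, Future}
object AsyncReplySketch {
  private val pool = Executors.newCachedThreadPool()
  private implicit val ec: ExecutionContext = ExecutionContext.fromExecutorService(pool)
  def doAsync[T](body: => T)(reply: T => Unit, fail: Throwable => Unit): Unit = {
    val future = Future(body)          // the potentially slow work runs off the RPC thread
    future.foreach(reply)              // success path: send the result back
    future.failed.foreach(fail)        // failure path: propagate the error to the caller
  }
  def main(args: Array[String]): Unit = {
    doAsync(40 + 2)(r => println(s"reply: $r"), t => println(s"failure: $t"))
    Thread.sleep(500)                  // crude wait for the demo callback before shutting down
    pool.shutdown()
  }
}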