
When we write MapReduce-style jobs with Spark, we always end up calling an action such as reduce, foreach or count to trigger job submission. If you look at the source of these methods, they all ultimately call SparkContext's runJob method, which in turn calls DAGScheduler's runJob method:

  def runJob[T, U: ClassTag](
      rdd: RDD[T],
      func: (TaskContext, Iterator[T]) => U,
      partitions: Seq[Int],
      resultHandler: (Int, U) => Unit): Unit = {
    if (stopped.get()) {
      throw new IllegalStateException("SparkContext has been shutdown")
    }
    val callSite = getCallSite
    val cleanedFunc = clean(func)
    logInfo("Starting job: " + callSite.shortForm)
    if (conf.getBoolean("spark.logLineage", false)) {
      logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
    }
    dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
    progressBar.foreach(_.finishAll())
    rdd.doCheckpoint()
  }
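Before following runJob further, note that this is a public entry point and that actions are thin wrappers around it. The hypothetical snippet below (e.g. pasted into spark-shell, where sc already exists; the names are illustrative) calls one of the simpler public runJob overloads directly, which ends up in the method shown above:

    // Assuming an already-created SparkContext `sc` (e.g. in spark-shell).
    val rdd = sc.parallelize(1 to 100, numSlices = 4)

    // Calling runJob directly: run a function over every partition and collect
    // the per-partition results back to the driver.
    val partitionSums: Array[Int] = sc.runJob(rdd, (iter: Iterator[Int]) => iter.sum)
    println(partitionSums.mkString(", "))

    // count() follows the same idea; its implementation is roughly
    //   sc.runJob(this, Utils.getIteratorSize _).sum
    println(rdd.count())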

Inside SparkContext.runJob, the RDD, the partition information and the result-handling callback are passed on to DAGScheduler.runJob:

  def runJob[T, U](
      rdd: RDD[T],
      func: (TaskContext, Iterator[T]) => U,
      partitions: Seq[Int],
      callSite: CallSite,
      resultHandler: (Int, U) => Unit,
      properties: Properties): Unit = {
    val start = System.nanoTime
    val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
    // Note: Do not call Await.ready(future) because that calls `scala.concurrent.blocking`,
    // which causes concurrent SQL executions to fail if a fork-join pool is used. Note that
    // due to idiosyncrasies in Scala, `awaitPermission` is not actually used anywhere so it's
    // safe to pass in null here. For more detail, see SPARK-13747.
    val awaitPermission = null.asInstanceOf[scala.concurrent.CanAwait]
    waiter.completionFuture.ready(Duration.Inf)(awaitPermission)
    waiter.completionFuture.value.get match {
      case scala.util.Success(_) =>
        logInfo("Job %d finished: %s, took %f s".format
          (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
      case scala.util.Failure(exception) =>
        logInfo("Job %d failed: %s, took %f s".format
          (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
        // SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.
        val callerStackTrace = Thread.currentThread().getStackTrace.tail
        exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
        throw exception
    }
  }

It then calls the submitJob method:

  def submitJob[T, U](
      rdd: RDD[T],
      func: (TaskContext, Iterator[T]) => U,
      partitions: Seq[Int],
      callSite: CallSite,
      resultHandler: (Int, U) => Unit,
      properties: Properties): JobWaiter[U] = {
    // Check to make sure we are not launching a task on a partition that does not exist.
    val maxPartitions = rdd.partitions.length
    partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
      throw new IllegalArgumentException(
        "Attempting to access a non-existent partition: " + p + ". " +
          "Total number of partitions: " + maxPartitions)
    }

    val jobId = nextJobId.getAndIncrement()
    if (partitions.size == 0) {
      // Return immediately if the job is running 0 tasks
      return new JobWaiter[U](this, jobId, 0, resultHandler)
    }

    assert(partitions.size > 0)
    val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
    val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
    eventProcessLoop.post(JobSubmitted(
      jobId, rdd, func2, partitions.toArray, callSite, waiter,
      SerializationUtils.clone(properties)))
    waiter
  }

Note the line eventProcessLoop.post(JobSubmitted(...)): it puts a job-submission message into a message queue, and a separate thread loops over that queue, taking messages out and executing the corresponding logic. The last line of the DAGScheduler class definition calls eventProcessLoop.start to start this event-loop thread. In the event-loop thread, a Scala case-class pattern match dispatches the message to DAGScheduler's handleJobSubmitted method. handleJobSubmitted is a fairly central method: stage generation, resolution of the dependencies between stages, and creation of the job are all done there.
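The pattern behind eventProcessLoop is worth seeing on its own: a blocking queue plus a single consumer thread that pattern-matches each event. Below is a minimal, self-contained sketch of that idea, for illustration only; the class and event names are invented here and this is not the verbatim DAGSchedulerEventProcessLoop / EventLoop source:

    import java.util.concurrent.LinkedBlockingDeque

    // Illustrative event types, loosely modeled on DAGScheduler's events (names are not Spark's).
    sealed trait SchedulerEvent
    case class JobSubmittedEvent(jobId: Int) extends SchedulerEvent
    case object StopLoop extends SchedulerEvent

    class SimpleEventLoop(name: String) {
      private val queue = new LinkedBlockingDeque[SchedulerEvent]()

      // Consumer thread: take events off the queue and dispatch them with a pattern
      // match, the same shape as DAGSchedulerEventProcessLoop.onReceive.
      private val thread = new Thread(name) {
        override def run(): Unit = {
          var running = true
          while (running) {
            queue.take() match {
              case JobSubmittedEvent(jobId) =>
                println(s"would call handleJobSubmitted for job $jobId") // the real loop calls DAGScheduler
              case StopLoop =>
                running = false
            }
          }
        }
      }
      thread.setDaemon(true)

      def start(): Unit = thread.start()                        // DAGScheduler's last line calls the real start()
      def post(event: SchedulerEvent): Unit = queue.put(event)  // what eventProcessLoop.post does
    }

With that pattern in mind, here is handleJobSubmitted: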

  private[scheduler] def handleJobSubmitted(jobId: Int,
      finalRDD: RDD[_],
      func: (TaskContext, Iterator[_]) => _,
      partitions: Array[Int],
      callSite: CallSite,
      listener: JobListener,
      properties: Properties) {
    var finalStage: ResultStage = null
    try {
      // New stage creation may throw an exception if, for example, jobs are run on a
      // HadoopRDD whose underlying HDFS files have been deleted.
      finalStage = newResultStage(finalRDD, func, partitions, jobId, callSite)
    } catch {
      case e: Exception =>
        logWarning("Creating new stage failed due to exception - job: " + jobId, e)
        listener.jobFailed(e)
        return
    }

    val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
    clearCacheLocs()
    logInfo("Got job %s (%s) with %d output partitions".format(
      job.jobId, callSite.shortForm, partitions.length))
    logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
    logInfo("Parents of final stage: " + finalStage.parents)
    logInfo("Missing parents: " + getMissingParentStages(finalStage))

    val jobSubmissionTime = clock.getTimeMillis()
    jobIdToActiveJob(jobId) = job
    activeJobs += job
    finalStage.setActiveJob(job)
    val stageIds = jobIdToStageIds(jobId).toArray
    val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
    listenerBus.post(
      SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
    submitStage(finalStage)

    submitWaitingStages()
  }

The first step is the call

finalStage = newResultStage(finalRDD, func, partitions, jobId, callSite)

This is where the dependencies between stages are resolved; stage ids are generated in increasing order along the dependency chain.

  private def newResultStage(
      rdd: RDD[_],
      func: (TaskContext, Iterator[_]) => _,
      partitions: Array[Int],
      jobId: Int,
      callSite: CallSite): ResultStage = {
    val (parentStages: List[Stage], id: Int) = getParentStagesAndId(rdd, jobId)
    val stage = new ResultStage(id, rdd, func, partitions, parentStages, jobId, callSite)
    stageIdToStage(id) = stage
    updateJobIdStageIdMaps(jobId, stage)
    stage
  }

getParentStagesAndId, together with the getParentStages it calls:

  private def getParentStagesAndId(rdd: RDD[_], firstJobId: Int): (List[Stage], Int) = {
    val parentStages = getParentStages(rdd, firstJobId)
    val id = nextStageId.getAndIncrement()
    (parentStages, id)
  }

  private def getParentStages(rdd: RDD[_], firstJobId: Int): List[Stage] = {
    val parents = new HashSet[Stage]
    val visited = new HashSet[RDD[_]]
    // We are manually maintaining a stack here to prevent StackOverflowError
    // caused by recursively visiting
    val waitingForVisit = new Stack[RDD[_]]
    def visit(r: RDD[_]) {
      if (!visited(r)) {
        visited += r
        // Kind of ugly: need to register RDDs with the cache here since
        // we can't do it in its constructor because # of partitions is unknown
        for (dep <- r.dependencies) {
          dep match {
            case shufDep: ShuffleDependency[_, _, _] =>
              parents += getShuffleMapStage(shufDep, firstJobId)
            case _ =>
              waitingForVisit.push(dep.rdd)
          }
        }
      }
    }
    waitingForVisit.push(rdd)
    while (waitingForVisit.nonEmpty) {
      visit(waitingForVisit.pop())
    }
    parents.toList
  }

As you can see, a manually maintained stack is used to do a depth-first traversal over all dependencies of each RDD: a shuffle dependency produces (or looks up) a ShuffleMapStage, while any other dependency simply pushes its parent RDD onto the stack to be visited later. Creating a ShuffleMapStage in turn calls getParentStagesAndId recursively, so the resulting finalStage sits at the top of the dependency tree and contains the whole sub-tree of its dependencies, and stage ids are assigned from the bottom of the dependency chain upward, in increasing order.
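To make this concrete, here is a small driver program (a hypothetical example; all names are chosen for illustration) whose lineage contains exactly one shuffle dependency, so the job is split into one ShuffleMapStage and one final ResultStage:

    import org.apache.spark.{SparkConf, SparkContext}

    object TwoStageExample {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setAppName("two-stage-example").setMaster("local[2]"))

        val words  = sc.parallelize(Seq("a", "b", "a", "c"))          // narrow lineage so far
        val pairs  = words.map(w => (w, 1))                           // still the same stage
        val counts = pairs.reduceByKey(_ + _)                         // ShuffleDependency introduced here
        val result = counts.map { case (w, n) => s"$w=$n" }.collect() // collect() is the action

        // When collect() reaches handleJobSubmitted, getParentStages walks the lineage,
        // finds the single ShuffleDependency from reduceByKey, and creates:
        //   ShuffleMapStage 0  (parallelize -> map -> shuffle write)
        //   ResultStage 1      (shuffle read -> map -> collect), the finalStage
        // submitStage(finalStage) then recurses into the missing parent, so stage 0
        // is submitted first and stage 1 runs only once its shuffle output is ready.
        result.foreach(println)
        sc.stop()
      }
    }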

After that, submitStage is called with finalStage to submit the job. During submission, however, it recursively resolves and submits each stage's parent stages, so the stages submitted first are the ones with no dependencies at all.

  private def submitStage(stage: Stage) {
    val jobId = activeJobForStage(stage)
    if (jobId.isDefined) {
      logDebug("submitStage(" + stage + ")")
      if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
        val missing = getMissingParentStages(stage).sortBy(_.id)
        logDebug("missing: " + missing)
        if (missing.isEmpty) {
          logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
          submitMissingTasks(stage, jobId.get)
        } else {
          for (parent <- missing) {
            submitStage(parent)
          }
          waitingStages += stage
        }
      }
    } else {
      abortStage(stage, "No active job for stage " + stage.id, None)
    }
  }

All of a stage's tasks are submitted through submitMissingTasks. Inside submitMissingTasks, the first step is to compute the placement (preferred locations) for each task:

    val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
      stage match {
        case s: ShuffleMapStage =>
          partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id)) }.toMap
        case s: ResultStage =>
          val job = s.activeJob.get
          partitionsToCompute.map { id =>
            val p = s.partitions(id)
            (id, getPreferredLocs(stage.rdd, p))
          }.toMap
      }
    } catch {
      case NonFatal(e) =>
        stage.makeNewStageAttempt(partitionsToCompute.size)
        listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
        abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
        runningStages -= stage
        return
    }
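getPreferredLocs ultimately reflects the data locality of the underlying RDD (for example, the hosts holding the HDFS blocks, or the executors holding cached partitions). If you want to observe these locations from the driver side, RDD exposes preferredLocations; the following hypothetical snippet (e.g. in spark-shell, where sc exists; the HDFS path is a placeholder) just prints them per partition:

    // Hypothetical inspection snippet (not part of DAGScheduler): print the
    // preferred locations Spark reports for each partition of an RDD.
    val rdd = sc.textFile("hdfs:///some/path")   // placeholder path
    rdd.partitions.foreach { p =>
      // For an HDFS-backed RDD these are typically the hosts holding the block replicas.
      println(s"partition ${p.index} -> ${rdd.preferredLocations(p).mkString(", ")}")
    }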

Back in submitMissingTasks, the tasks are then serialized:

    var taskBinary: Broadcast[Array[Byte]] = null
    try {
      // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
      // For ResultTask, serialize and broadcast (rdd, func).
      val taskBinaryBytes: Array[Byte] = stage match {
        case stage: ShuffleMapStage =>
          JavaUtils.bufferToArray(
            closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef))
        case stage: ResultStage =>
          JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, stage.func): AnyRef))
      }

and the serialized task binary is broadcast:

  taskBinary = sc.broadcast(taskBinaryBytes)

Next, the task information is wrapped into an array of Task objects (ShuffleMapTask or ResultTask, depending on the stage):

    val tasks: Seq[Task[_]] = try {
      stage match {
        case stage: ShuffleMapStage =>
          partitionsToCompute.map { id =>
            val locs = taskIdToLocations(id)
            val part = stage.rdd.partitions(id)
            new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
              taskBinary, part, locs, stage.latestInfo.taskMetrics, properties)
          }

        case stage: ResultStage =>
          val job = stage.activeJob.get
          partitionsToCompute.map { id =>
            val p: Int = stage.partitions(id)
            val part = stage.rdd.partitions(p)
            val locs = taskIdToLocations(id)
            new ResultTask(stage.id, stage.latestInfo.attemptId,
              taskBinary, part, locs, id, properties, stage.latestInfo.taskMetrics)
          }
      }
    } catch {
      case NonFatal(e) =>
        abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
        runningStages -= stage
        return
    }

and taskScheduler is called to submit the task set:

    taskScheduler.submitTasks(new TaskSet(
      tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))

submitTasks mainly hands the TaskSet over to a TaskSetManager to manage. Another key step is the call to schedulableBuilder's addTaskSetManager; the SchedulableBuilder is the application-level scheduler and supports two scheduling modes, FIFO and FAIR. The mode is chosen through the spark.scheduler.mode configuration property (set on the SparkConf or in spark-defaults.conf) and defaults to FIFO. Finally, submitTasks calls

  backend.reviveOffers() 
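As an aside on the scheduling mode mentioned above: switching to FAIR scheduling is purely a configuration change. A minimal, hypothetical example (the app name is illustrative and the allocation-file path is a placeholder):

    import org.apache.spark.{SparkConf, SparkContext}

    object FairSchedulingExample {
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf()
          .setAppName("fair-scheduling-example")
          .setMaster("local[2]")
          .set("spark.scheduler.mode", "FAIR")                                   // default is FIFO
          .set("spark.scheduler.allocation.file", "/path/to/fairscheduler.xml")  // optional pool definitions (placeholder path)
        val sc = new SparkContext(conf)
        try {
          println(sc.parallelize(1 to 10).sum())   // any action; its tasks go through the FAIR scheduler
        } finally {
          sc.stop()
        }
      }
    }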

The backend.reviveOffers() call above goes to CoarseGrainedSchedulerBackend.reviveOffers, which sends a ReviveOffers case object to driverEndpoint. This message is actually addressed to driverEndpoint itself (see Spark's RpcEnv module for details), so driverEndpoint is also the one that handles it: its receive method is triggered and routes the message to makeOffers.
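The relevant branch of the endpoint's receive handler is essentially a one-line dispatch; roughly (a simplified sketch of the Spark 2.x code path, not the verbatim source, and details vary by version):

    // Inside CoarseGrainedSchedulerBackend.DriverEndpoint (simplified sketch):
    override def receive: PartialFunction[Any, Unit] = {
      case ReviveOffers =>
        makeOffers()   // the self-addressed message just triggers a new round of resource offers
      // ... other messages such as StatusUpdate, KillTask, etc. are handled here as well
    }

makeOffers then looks like this: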

    private def makeOffers() {
      // Filter out executors under killing
      val activeExecutors = executorDataMap.filterKeys(executorIsAlive)
      val workOffers = activeExecutors.map { case (id, executorData) =>
        new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
      }.toSeq
      launchTasks(scheduler.resourceOffers(workOffers))
    }

In makeOffers, the first step is to gather all executors that are currently available for computation and build the corresponding WorkerOffers, which describe the usable CPU cores on every live ExecutorBackend; a WorkerOffer tells us exactly what resources a given executor can offer. The algorithm that decides which ExecutorBackend a task actually runs on lives in the scheduler's resourceOffers method (TaskSchedulerImpl, which in turn consults each TaskSetManager); we will discuss that algorithm later. launchTasks is then called to send the tasks to the ExecutorBackends for execution. The code is as follows:

    private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
      for (task <- tasks.flatten) {
        val serializedTask = ser.serialize(task)
        if (serializedTask.limit >= maxRpcMessageSize) {
          scheduler.taskIdToTaskSetManager.get(task.taskId).foreach { taskSetMgr =>
            try {
              var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
                "spark.rpc.message.maxSize (%d bytes). Consider increasing " +
                "spark.rpc.message.maxSize or using broadcast variables for large values."
              msg = msg.format(task.taskId, task.index, serializedTask.limit, maxRpcMessageSize)
              taskSetMgr.abort(msg)
            } catch {
              case e: Exception => logError("Exception in error callback", e)
            }
          }
        } else {
          val executorData = executorDataMap(task.executorId)
          executorData.freeCores -= scheduler.CPUS_PER_TASK

          logInfo(s"Launching task ${task.taskId} on executor id: ${task.executorId} hostname: " +
            s"${executorData.executorHost}.")

          executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask)))
        }
      }
    }

Here you can see the logic that sends the serialized task to the executor, so the overall job-submission flow ends at this point.
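For completeness, on the receiving side the LaunchTask message is handled by CoarseGrainedExecutorBackend, which deserializes the TaskDescription and hands it to its Executor. The following is only a rough sketch of that handler (simplified, not verbatim source; consult the Spark version you are reading for the exact code):

    // Simplified sketch of the LaunchTask branch in CoarseGrainedExecutorBackend.receive:
    case LaunchTask(data) =>
      if (executor == null) {
        exitExecutor(1, "Received LaunchTask command but executor was null")
      } else {
        val taskDesc = ser.deserialize[TaskDescription](data.value)
        logInfo("Got assigned task " + taskDesc.taskId)
        // The Executor runs the task in its thread pool; results flow back as StatusUpdate messages.
        executor.launchTask(this, taskId = taskDesc.taskId, attemptNumber = taskDesc.attemptNumber,
          taskDesc.name, taskDesc.serializedTask)
      }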

Source: https://my.oschina.net/nalenwind/blog/1786172

Reposted from: https://www.cnblogs.com/hd-zg/p/8724597.html
