Vis*_*akh 6 hadoop kerberos hadoop-yarn apache-spark cloudera-cdh
我们有一个在 Yarn 上运行 Spark 的 kerberized 集群。目前,我们在本地用 Scala 编写 Spark 代码,然后构建一个胖 JAR,我们将其复制到集群中,然后运行 spark-submit。我想在我的本地 PC 上编写 Spark 代码并让它直接在集群上运行。有没有直接的方法来做到这一点?Spark 文档似乎没有任何这样的模式。
仅供参考,我的本地机器正在运行 Windows 并且集群正在运行CDH。
虽然 cricket007 的答案适用于 Spark-Submit,但以下是我使用 IntelliJ 对远程集群运行的操作:
首先,确保客户端和服务器端的 JAR 相同。由于我们使用的是 CDH 7.1,因此我确保所有 JAR 都来自特定发行版。
按照 cricket007 的答案中所述设置 HADOOP_CONF_DIR 和 YARN_CONF_DIR 。在 Spark conf 中根据需要设置“spark.yarn.principal”和“spark.yarn.keytab”。
如果连接到 HDFS,请确保在 build.sbt 中设置以下排除规则:
libraryDependencies += "org.apache.hadoop" % "hadoop-client" % "2.6.0-cdh5.7.1" excludeAll ExclusionRule(organization = "javax.servlet")
Run Code Online (Sandbox Code Playgroud)
确保spark-launcher 和spark-yarn JAR 已在build.sbt 中列出。
libraryDependencies += "org.apache.spark" %% "spark-launcher" % "1.6.0-cdh5.7.1"
libraryDependencies += "org.apache.spark" %% "spark-yarn" % "1.6.0-cdh5.7.1"
Run Code Online (Sandbox Code Playgroud)
在服务器上找到 CDH JAR,并将其复制到 HDFS 上的已知位置。将以下行添加到您的代码中:
final val CDH_JAR_PATH = "/opt/cloudera/parcels/CDH/jars"
final val hadoopJars: Seq[String] = Seq[String](
"hadoop-annotations-2.6.0-cdh5.7.1.jar"
, "hadoop-ant-2.6.0-cdh5.7.1.jar"
, "hadoop-ant-2.6.0-mr1-cdh5.7.1.jar"
, "hadoop-archive-logs-2.6.0-cdh5.7.1.jar"
, "hadoop-archives-2.6.0-cdh5.7.1.jar"
, "hadoop-auth-2.6.0-cdh5.7.1.jar"
, "hadoop-aws-2.6.0-cdh5.7.1.jar"
, "hadoop-azure-2.6.0-cdh5.7.1.jar"
, "hadoop-capacity-scheduler-2.6.0-mr1-cdh5.7.1.jar"
, "hadoop-common-2.6.0-cdh5.7.1.jar"
, "hadoop-core-2.6.0-mr1-cdh5.7.1.jar"
, "hadoop-datajoin-2.6.0-cdh5.7.1.jar"
, "hadoop-distcp-2.6.0-cdh5.7.1.jar"
, "hadoop-examples-2.6.0-mr1-cdh5.7.1.jar"
, "hadoop-examples.jar"
, "hadoop-extras-2.6.0-cdh5.7.1.jar"
, "hadoop-fairscheduler-2.6.0-mr1-cdh5.7.1.jar"
, "hadoop-gridmix-2.6.0-cdh5.7.1.jar"
, "hadoop-gridmix-2.6.0-mr1-cdh5.7.1.jar"
, "hadoop-hdfs-2.6.0-cdh5.7.1.jar"
, "hadoop-hdfs-nfs-2.6.0-cdh5.7.1.jar"
, "hadoop-kms-2.6.0-cdh5.7.1.jar"
, "hadoop-mapreduce-client-app-2.6.0-cdh5.7.1.jar"
, "hadoop-mapreduce-client-common-2.6.0-cdh5.7.1.jar"
, "hadoop-mapreduce-client-core-2.6.0-cdh5.7.1.jar"
, "hadoop-mapreduce-client-hs-2.6.0-cdh5.7.1.jar"
, "hadoop-mapreduce-client-hs-plugins-2.6.0-cdh5.7.1.jar"
, "hadoop-mapreduce-client-jobclient-2.6.0-cdh5.7.1.jar"
, "hadoop-mapreduce-client-nativetask-2.6.0-cdh5.7.1.jar"
, "hadoop-mapreduce-client-shuffle-2.6.0-cdh5.7.1.jar"
, "hadoop-nfs-2.6.0-cdh5.7.1.jar"
, "hadoop-openstack-2.6.0-cdh5.7.1.jar"
, "hadoop-rumen-2.6.0-cdh5.7.1.jar"
, "hadoop-sls-2.6.0-cdh5.7.1.jar"
, "hadoop-streaming-2.6.0-cdh5.7.1.jar"
, "hadoop-streaming-2.6.0-mr1-cdh5.7.1.jar"
, "hadoop-tools-2.6.0-mr1-cdh5.7.1.jar"
, "hadoop-yarn-api-2.6.0-cdh5.7.1.jar"
, "hadoop-yarn-applications-distributedshell-2.6.0-cdh5.7.1.jar"
, "hadoop-yarn-applications-unmanaged-am-launcher-2.6.0-cdh5.7.1.jar"
, "hadoop-yarn-client-2.6.0-cdh5.7.1.jar"
, "hadoop-yarn-common-2.6.0-cdh5.7.1.jar"
, "hadoop-yarn-registry-2.6.0-cdh5.7.1.jar"
, "hadoop-yarn-server-applicationhistoryservice-2.6.0-cdh5.7.1.jar"
, "hadoop-yarn-server-common-2.6.0-cdh5.7.1.jar"
, "hadoop-yarn-server-nodemanager-2.6.0-cdh5.7.1.jar"
, "hadoop-yarn-server-resourcemanager-2.6.0-cdh5.7.1.jar"
, "hadoop-yarn-server-web-proxy-2.6.0-cdh5.7.1.jar"
, "hbase-hadoop2-compat-1.2.0-cdh5.7.1.jar"
, "hbase-hadoop-compat-1.2.0-cdh5.7.1.jar")
final val sparkJars: Seq[String] = Seq[String](
"spark-1.6.0-cdh5.7.1-yarn-shuffle.jar",
"spark-assembly-1.6.0-cdh5.7.1-hadoop2.6.0-cdh5.7.1.jar",
"spark-avro_2.10-1.1.0-cdh5.7.1.jar",
"spark-bagel_2.10-1.6.0-cdh5.7.1.jar",
"spark-catalyst_2.10-1.6.0-cdh5.7.1.jar",
"spark-core_2.10-1.6.0-cdh5.7.1.jar",
"spark-examples-1.6.0-cdh5.7.1-hadoop2.6.0-cdh5.7.1.jar",
"spark-graphx_2.10-1.6.0-cdh5.7.1.jar",
"spark-hive_2.10-1.6.0-cdh5.7.1.jar",
"spark-launcher_2.10-1.6.0-cdh5.7.1.jar",
"spark-mllib_2.10-1.6.0-cdh5.7.1.jar",
"spark-network-common_2.10-1.6.0-cdh5.7.1.jar",
"spark-network-shuffle_2.10-1.6.0-cdh5.7.1.jar",
"spark-repl_2.10-1.6.0-cdh5.7.1.jar",
"spark-sql_2.10-1.6.0-cdh5.7.1.jar",
"spark-streaming-flume-sink_2.10-1.6.0-cdh5.7.1.jar",
"spark-streaming-flume_2.10-1.6.0-cdh5.7.1.jar",
"spark-streaming-kafka_2.10-1.6.0-cdh5.7.1.jar",
"spark-streaming_2.10-1.6.0-cdh5.7.1.jar",
"spark-unsafe_2.10-1.6.0-cdh5.7.1.jar",
"spark-yarn_2.10-1.6.0-cdh5.7.1.jar")
def getClassPath(jarNames: Seq[String], pathPrefix: String): String = {
jarNames.foldLeft("")((cp, name) => s"$cp:$pathPrefix/$name").drop(1)
Run Code Online (Sandbox Code Playgroud)
}
创建 SparkConf 时添加以下行:
.set("spark.driver.extraClassPath", getClassPath(sparkJars ++ hadoopJars, CDH_JAR_PATH))
.set("spark.executor.extraClassPath", getClassPath(sparkJars ++ hadoopJars, CDH_JAR_PATH))
.set("spark.yarn.jars", "hdfs://$YOUR_MACHINE/PATH_TO_JARS/*")
Run Code Online (Sandbox Code Playgroud)
您的程序现在应该可以运行了。