Py4JJavaError:尝试将 rdd 数据帧写入本地目录上的 parquet 文件时调用 o389.parquet 时发生错误

Sta*_*tSS 5 python typeerror apache-spark parquet pyspark

我正在尝试在 Jupyter 笔记本中使用以下代码编写一个数据帧以在本地目录上拼花文件:

rdd1 = rdd.coalesce(partitions)

schema1 = StructType([StructField('date', DateType()), StructField('open', FloatType()), StructField('high', FloatType()),
           StructField('low', FloatType()),StructField('close', FloatType()),StructField('adj_close', FloatType()),
           StructField('volume', FloatType()), StructField('stock', StringType())])

rddDF = spark.createDataFrame(rdd1,schema=schema1)

spark.conf.set("spark.sql.parquet.compression.codec", "gzip")

rddDF.write.parquet("C:/Users/"User"/Documents/File/Output/rddDF")
Run Code Online (Sandbox Code Playgroud)

我收到以下错误:

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-11-7b2aeb627267> in <module>
    16 
    17 #rddDF.to_parquet("C:/Users/Sabihah/Documents/6. Processing Big Data/Output/rddDF")
---> 18 rddDF.write.parquet("C:/Users/Sabihah/Documents/6. Processing Big Data/Output/rddDF")
    19 #rddDF.write.format("parquet").save("C:/Users/Sabihah/Documents/6. Processing Big Data/Output/rddDF")

~\anaconda3\lib\site-packages\pyspark\sql\readwriter.py in parquet(self, path, mode, partitionBy, compression)
   883             self.partitionBy(partitionBy)
   884         self._set_opts(compression=compression)
--> 885         self._jwrite.parquet(path)
   886 
   887     def text(self, path, compression=None, lineSep=None):

~\anaconda3\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
  1307 
  1308         answer = self.gateway_client.send_command(command)
-> 1309         return_value = get_return_value(
  1310             answer, self.gateway_client, self.target_id, self.name)
  1311 

~\anaconda3\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw)
   109     def deco(*a, **kw):
   110         try:
--> 111             return f(*a, **kw)
   112         except py4j.protocol.Py4JJavaError as e:
   113             converted = convert_exception(e.java_exception)

~\anaconda3\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
   324             value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
   325             if answer[1] == REFERENCE_TYPE:
--> 326                 raise Py4JJavaError(
   327                     "An error occurred while calling {0}{1}{2}.\n".
   328                     format(target_id, ".", name), value)

Py4JJavaError: An error occurred while calling o48.parquet.
: org.apache.spark.SparkException: Job aborted.
   at org.apache.spark.sql.errors.QueryExecutionErrors$.jobAbortedError(QueryExecutionErrors.scala:496)
   at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:251)
   at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:186)
   at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
   at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
   at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
   at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:110)
   at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
   at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
   at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
   at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
   at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
   at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:110)
   at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:106)
   at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:481)
   at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:82)
   at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:481)
   at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
   at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
   at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
   at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
   at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
   at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:457)
   at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:106)
   at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:93)
   at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:91)
   at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:128)
   at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:848)
   at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:382)
   at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:355)
   at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
   at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:781)
   at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
   at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
   at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
   at java.lang.reflect.Method.invoke(Unknown Source)
   at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
   at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
   at py4j.Gateway.invoke(Gateway.java:282)
   at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
   at py4j.commands.CallCommand.execute(CallCommand.java:79)
   at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
   at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
   at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 0.0 failed 1 times, most recent failure: Lost task 1.0 in stage 0.0 (TID 1) (DESKTOP-JBUENQG executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
 File "C:\Spark\Spark\python\lib\pyspark.zip\pyspark\worker.py", line 619, in main
 File "C:\Spark\Spark\python\lib\pyspark.zip\pyspark\worker.py", line 611, in process
 File "C:\Spark\Spark\python\lib\pyspark.zip\pyspark\serializers.py", line 259, in dump_stream
   vs = list(itertools.islice(iterator, batch))
 File "C:\Spark\Spark\python\lib\pyspark.zip\pyspark\util.py", line 74, in wrapper
   return f(*args, **kwargs)
 File "C:\Users\Sabihah\anaconda3\lib\site-packages\pyspark\sql\session.py", line 682, in prepare
   verify_func(obj)
 File "C:\Users\Sabihah\anaconda3\lib\site-packages\pyspark\sql\types.py", line 1411, in verify
   verify_value(obj)
 File "C:\Users\Sabihah\anaconda3\lib\site-packages\pyspark\sql\types.py", line 1398, in verify_struct
   raise TypeError(new_msg("StructType can not accept object %r in type %s"
TypeError: StructType can not accept object 'close' in type <class 'str'>

   at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:545)
   at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:703)
   at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:685)
   at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:498)
   at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
   at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
   at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
   at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
   at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
   at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
   at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
   at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:286)
   at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$16(FileFormatWriter.scala:229)
   at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
   at org.apache.spark.scheduler.Task.run(Task.scala:131)
   at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
   at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
   at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
   at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
   at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
   at java.lang.Thread.run(Unknown Source)

Driver stacktrace:
   at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2403)
   at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2352)
   at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2351)
   at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
   at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
   at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
   at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2351)
   at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1109)
   at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1109)
   at scala.Option.foreach(Option.scala:407)
   at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1109)
   at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2591)
   at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2533)
   at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2522)
   at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
   at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:898)
   at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
   at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:218)
   ... 42 more
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
 File "C:\Spark\Spark\python\lib\pyspark.zip\pyspark\worker.py", line 619, in main
 File "C:\Spark\Spark\python\lib\pyspark.zip\pyspark\worker.py", line 611, in process
 File "C:\Spark\Spark\python\lib\pyspark.zip\pyspark\serializers.py", line 259, in dump_stream
   vs = list(itertools.islice(iterator, batch))
 File "C:\Spark\Spark\python\lib\pyspark.zip\pyspark\util.py", line 74, in wrapper
   return f(*args, **kwargs)
 File "C:\Users\Sabihah\anaconda3\lib\site-packages\pyspark\sql\session.py", line 682, in prepare
   verify_func(obj)
 File "C:\Users\Sabihah\anaconda3\lib\site-packages\pyspark\sql\types.py", line 1411, in verify
   verify_value(obj)
 File "C:\Users\Sabihah\anaconda3\lib\site-packages\pyspark\sql\types.py", line 1398, in verify_struct
   raise TypeError(new_msg("StructType can not accept object %r in type %s"
TypeError: StructType can not accept object 'close' in type <class 'str'>

   at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:545)
   at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:703)
   at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:685)
   at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:498)
   at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
   at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
   at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
   at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
   at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
   at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
   at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
   at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:286)
   at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$16(FileFormatWriter.scala:229)
   at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
   at org.apache.spark.scheduler.Task.run(Task.scala:131)
   at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
   at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
   at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
   at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
   at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
   ... 1 more
Run Code Online (Sandbox Code Playgroud)

我检查了所有系统变量:Hadoop_home、Java_home、Spark_home、Scala_home、Pyspark_python、Pyspark_driver_python。

我已经安装了 Spark v3.2、Hadoop v2.7 和 Scala 2.12.4,更新到 v2.12.10。我在笔记本中使用 Python 3.8。

我尝试降级到 Python 3.7,但这并没有解决问题。

我不确定还可以尝试什么来修复此错误。任何帮助,将不胜感激。

更新:我尝试修复数据类型,但错误仍然存​​在。

然后,我对创建数据框的方式进行了以下更改:

rddDF = spark.createDataFrame([rdd1],schema=schema1)
Run Code Online (Sandbox Code Playgroud)

这消除了类型错误:

StructType can not accept object 'close' in type <class 'str'>
Run Code Online (Sandbox Code Playgroud)

我的错误现在显示:

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-23-ec6844f98c97> in <module>
     16 
     17 #rddDF.to_parquet("C:/Users/Sabihah/Documents/6. Processing Big Data/Output/rddDF")
---> 18 rddDF.write.parquet("C:/Users/Sabihah/Documents/6. Processing Big Data/Output/rddDF")
     19 #rddDF.write.format("parquet").save("C:/Users/Sabihah/Documents/6. Processing Big Data/Output/rddDF")

~\anaconda3\lib\site-packages\pyspark\sql\readwriter.py in parquet(self, path, mode, partitionBy, compression)
    883             self.partitionBy(partitionBy)
    884         self._set_opts(compression=compression)
--> 885         self._jwrite.parquet(path)
    886 
    887     def text(self, path, compression=None, lineSep=None):

~\anaconda3\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
   1307 
   1308         answer = self.gateway_client.send_command(command)
-> 1309         return_value = get_return_value(
   1310             answer, self.gateway_client, self.target_id, self.name)
   1311 

~\anaconda3\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw)
    109     def deco(*a, **kw):
    110         try:
--> 111             return f(*a, **kw)
    112         except py4j.protocol.Py4JJavaError as e:
    113             converted = convert_exception(e.java_exception)

~\anaconda3\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
    324             value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
    325             if answer[1] == REFERENCE_TYPE:
--> 326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
    328                     format(target_id, ".", name), value)

Py4JJavaError: An error occurred while calling o389.parquet.
: org.apache.spark.SparkException: Job aborted.
    at org.apache.spark.sql.errors.QueryExecutionErrors$.jobAbortedError(QueryExecutionErrors.scala:496)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:251)
    at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:186)
    at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
    at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
    at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:110)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:110)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:106)
    at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:481)
    at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:82)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:481)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:457)
    at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:106)
    at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:93)
    at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:91)
    at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:128)
    at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:848)
    at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:382)
    at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:355)
    at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
    at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:781)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl