我正在尝试继承 DataFrame 类并添加自定义方法(如下所示),以便能够流畅地进行链式调用,并确保所有方法都作用于同一个 DataFrame。但运行时抛出了异常:Column is not iterable(列不可迭代)。
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import lit
class Myclass(DataFrame):
    """DataFrame subclass whose custom methods can be chained fluently.

    Spark DataFrames are immutable: ``withColumn`` returns a new DataFrame
    instead of modifying the receiver. Each custom method therefore wraps
    the new DataFrame in a fresh instance of this class, so further custom
    methods and regular DataFrame methods can be chained on the result.
    """

    def __init__(self, df: DataFrame) -> None:
        """Wrap ``df`` by reusing its underlying JVM DataFrame handle.

        Args:
            df: The DataFrame to wrap.
        """
        # NOTE(review): ``sql_ctx`` is kept from the original code; newer
        # PySpark releases appear to prefer ``df.sparkSession`` -- confirm
        # against the installed PySpark version.
        super().__init__(df._jdf, df.sql_ctx)

    def add_column3(self) -> "Myclass":
        """Return a new wrapped DataFrame with a literal column ``col3`` = 3."""
        # Use the Python-level ``withColumn`` (not ``self._jdf.withColumn``):
        # handing a Python Column to the raw JVM handle is what raised
        # "Column is not iterable", and its result was discarded anyway.
        return type(self)(self.withColumn("col3", lit(3)))

    def add_column4(self) -> "Myclass":
        """Return a new wrapped DataFrame with a literal column ``col4`` = 4."""
        # Same reasoning as add_column3: go through the Python API and
        # re-wrap the new DataFrame so the chain keeps the custom methods.
        return type(self)(self.withColumn("col4", lit(4)))
if __name__ == "__main__":
    # Local import: only the script entry point needs to create a session.
    from pyspark.sql import SparkSession

    # Spark initialization; getOrCreate() reuses an already-running session.
    spark = SparkSession.builder.getOrCreate()

    # Input data:
    #   col1 col2
    #   a    1
    #   b    2
    df = spark.createDataFrame([("a", 1), ("b", 2)], ["col1", "col2"])

    # The class is declared as ``Myclass``; the original ``MyClass`` spelling
    # would raise NameError.
    myobj = Myclass(df)

    # Chain the custom methods, then finish with a regular DataFrame method.
    # ``drop`` is the DataFrame API for removing columns (there is no
    # ``drop_columns``); it takes column names as positional arguments.
    result = myobj.add_column3().add_column4().drop("col1")

    # Transformations are lazy; show() triggers execution and prints the rows.
    result.show()
'''
Expected Output
col2, col3,col4
1,3,4 …Run Code Online (Sandbox Code Playgroud)