I'm trying to optimize the code below. Running it on 1,000 rows of data takes about 12 minutes to complete. Our use case needs roughly 25K-50K rows, which makes this implementation completely infeasible (even at linear scaling that would be around 5-10 hours).
import pyspark.sql.types as Types
import numpy
import spacy
from pyspark.sql.functions import udf
inputPath = "s3://myData/part-*.parquet"
df = spark.read.parquet(inputPath)
test_df = df.select('uid', 'content').limit(1000).repartition(10)
# print(df.rdd.getNumPartitions()) -> 4
# print(test_df.rdd.getNumPartitions()) -> 1
def load_glove(fn):
    vector_dict = {}
    count = 0
    with open(fn) as inf:
        for line in inf:
            count += 1
            eles = line.strip().split()
            token = eles[0]
            try:
                vector_dict[token] = numpy.array([float(x) for x in eles[1:]])
                assert len(vector_dict[token]) == 300
            except:
                print("Exception in load_glove")
…