我正在尝试从一组压缩的 CSV 文件创建 dask 数据框。查阅相关资料后发现,似乎需要使用 dask 的 delayed()(即 dask.delayed)来实现。
import glob
import dask.dataframe as dd
import zipfile
import pandas as pd
from dask.delayed import delayed
# Build zip_dict, mapping each archive's base name (without ".zip") to an
# open zipfile.ZipFile for that archive.
# The pattern has to name the archives themselves: a bare directory path
# matches only the directory, which zipfile.ZipFile cannot open.
file_list = glob.glob('my_directory/zip_files/*.zip')
zip_dict = {}
for f in file_list:
    # Use the last path component instead of a fixed index so the key does
    # not depend on how deeply the directory is nested; [:-4] drops ".zip".
    key = f.split('/')[-1][:-4]
    zip_dict[key] = zipfile.ZipFile(f)
Run Code Online (Sandbox Code Playgroud)
zip_dict 的示例内容:zip_dict = {'log20160201': <zipfile.ZipFile filename='/my_directory/zip_files/log20160201.zip' mode='r'>, 'log20160218': <zipfile.ZipFile filename='/my_directory/zip_files/log20160218.zip' mode='r'>}
def _read_zipped_csv(zip_path, member, **read_csv_kwargs):
    """Read one CSV member out of a zip archive into a pandas DataFrame.

    The archive is opened inside this function so that, when the call is
    wrapped in ``delayed``, the file handle is created at compute time in
    the task that uses it, rather than at graph-construction time.

    Args:
        zip_path: Path of the zip archive on disk.
        member: Name of the CSV file inside the archive.
        **read_csv_kwargs: Forwarded unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that member.
    """
    with zipfile.ZipFile(zip_path) as archive:
        with archive.open(member) as handle:
            return pd.read_csv(handle, **read_csv_kwargs)


# Create list of delayed CSV reads, one per archive in zip_dict.
d_rows = []
for k, v in zip_dict.items():
    # Pass the archive's path, not an already-open member handle: the handle
    # would be invalid by the time the delayed task runs, because the
    # archive is closed just below.
    row = delayed(_read_zipped_csv)(
        v.filename, k + '.csv', usecols=['time', 'cik']
    )
    d_rows.append(row)
    v.close()
Run Code Online (Sandbox Code Playgroud)
d_rows …