我有一个大小为 170kB 的 CSV 文件,将它转换为 Parquet 文件后,大小变成了 1.2MB。数据结构是 12 列字符串。
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
# Convert a CSV file into a single gzip-compressed Parquet file, streaming the
# CSV in chunks so the whole file never has to be loaded at once.
csv_filename = "../files/test.csv"
parquet_filename = '../files/sample.parquet'
# Rows per chunk. Each chunk is handed to write_table() separately and ends up
# as its own Parquet row group, which carries per-column metadata and its own
# compressed pages. With chunksize = 1 that fixed overhead is paid once per
# row and the output becomes several times larger than the CSV; a large chunk
# makes the overhead negligible.
chunksize = 100_000
pqwriter = None
# A multi-character delimiter is only handled by pandas' Python parser; naming
# the engine explicitly avoids the ParserWarning about the implicit fallback.
for df in pd.read_csv(csv_filename, delimiter='_;_', chunksize=chunksize, engine='python'):
    # NOTE(review): the writer's schema is taken from the first chunk, and
    # write_table() rejects a chunk whose inferred column types differ (e.g. a
    # column that is entirely empty in one chunk). Casting each chunk with
    # df.astype(str) is one way to force a uniform schema -- confirm whether
    # that is wanted here.
    table = pa.Table.from_pandas(df=df)
    if pqwriter is None:
        # First chunk: create the Parquet writer for the output file using
        # this chunk's schema.
        pqwriter = pq.ParquetWriter(parquet_filename, table.schema, compression='gzip', use_dictionary=False)
    pqwriter.write_table(table)
# close the parquet writer
if pqwriter: …Run Code Online (Sandbox Code Playgroud)