我在python中有一个大约5000行的csv文件我想将它分成五个文件.
我为它写了一个代码,但它没有用
import codecs
import csv
NO_OF_LINES_PER_FILE = 1000
def again(count_file_header,count):
f3 = open('write_'+count_file_header+'.csv', 'at')
with open('import_1458922827.csv', 'rb') as csvfile:
candidate_info_reader = csv.reader(csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
co = 0
for row in candidate_info_reader:
co = co + 1
count = count + 1
if count <= count:
pass
elif count >= NO_OF_LINES_PER_FILE:
count_file_header = count + NO_OF_LINES_PER_FILE
again(count_file_header,count)
else:
writer = csv.writer(f3,delimiter = ',', lineterminator='\n',quoting=csv.QUOTE_ALL)
writer.writerow(row)
def read_write():
f3 = open('write_'+NO_OF_LINES_PER_FILE+'.csv', 'at')
with open('import_1458922827.csv', 'rb') as csvfile:
candidate_info_reader = csv.reader(csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
count = 0
for row in candidate_info_reader:
count = count + 1
if count >= NO_OF_LINES_PER_FILE:
count_file_header = count + NO_OF_LINES_PER_FILE
again(count_file_header,count)
else:
writer = csv.writer(f3,delimiter = ',', lineterminator='\n',quoting=csv.QUOTE_ALL)
writer.writerow(row)
read_write()
Run Code Online (Sandbox Code Playgroud)
上面的代码创建了许多空内容的文件.
如何将一个文件拆分为五个csv文件?
Azi*_*lto 21
在Python中
使用readlines()和writelines()执行此操作,这是一个示例:
>>> csvfile = open('import_1458922827.csv', 'r').readlines()
>>> filename = 1
>>> for i in range(len(csvfile)):
... if i % 1000 == 0:
... open(str(filename) + '.csv', 'w+').writelines(csvfile[i:i+1000])
... filename += 1
Run Code Online (Sandbox Code Playgroud)
输出文件名称将被编号1.csv,2.csv,...等等.
从终端
仅供参考,您可以使用split以下命令从命令行执行此操作:
$ split -l 1000 import_1458922827.csv
Run Code Online (Sandbox Code Playgroud)
Rud*_*koŭ 16
我建议你不要发明一个轮子.有现成的解决方案.来源于此
import os
def split(filehandler, delimiter=',', row_limit=1000,
output_name_template='output_%s.csv', output_path='.', keep_headers=True):
import csv
reader = csv.reader(filehandler, delimiter=delimiter)
current_piece = 1
current_out_path = os.path.join(
output_path,
output_name_template % current_piece
)
current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
current_limit = row_limit
if keep_headers:
headers = reader.next()
current_out_writer.writerow(headers)
for i, row in enumerate(reader):
if i + 1 > current_limit:
current_piece += 1
current_limit = row_limit * current_piece
current_out_path = os.path.join(
output_path,
output_name_template % current_piece
)
current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
if keep_headers:
current_out_writer.writerow(headers)
current_out_writer.writerow(row)
Run Code Online (Sandbox Code Playgroud)
使用它像:
split(open('/your/pat/input.csv', 'r'));
Run Code Online (Sandbox Code Playgroud)
一个带有 Pandas 的简单 Python 3 解决方案,不会切断最后一批
def to_csv_batch(src_csv, dst_dir, size=30000, index=False):
import pandas as pd
import math
# Read source csv
df = pd.read_csv(src_csv)
# Initial values
low = 0
high = size
# Loop through batches
for i in range(math.ceil(len(df) / size)):
fname = dst_dir+'/Batch_' + str(i+1) + '.csv'
df[low:high].to_csv(fname, index=index)
# Update selection
low = high
if (high + size < len(df)):
high = high + size
else:
high = len(df)
Run Code Online (Sandbox Code Playgroud)
使用示例
to_csv_batch('Batch_All.csv', 'Batches')
Run Code Online (Sandbox Code Playgroud)
一个python3友好的解决方案:
def split_csv(source_filepath, dest_folder, split_file_prefix,
records_per_file):
"""
Split a source csv into multiple csvs of equal numbers of records,
except the last file.
Includes the initial header row in each split file.
Split files follow a zero-index sequential naming convention like so:
`{split_file_prefix}_0.csv`
"""
if records_per_file <= 0:
raise Exception('records_per_file must be > 0')
with open(source_filepath, 'r') as source:
reader = csv.reader(source)
headers = next(reader)
file_idx = 0
records_exist = True
while records_exist:
i = 0
target_filename = f'{split_file_prefix}_{file_idx}.csv'
target_filepath = os.path.join(dest_folder, target_filename)
with open(target_filepath, 'w') as target:
writer = csv.writer(target)
while i < records_per_file:
if i == 0:
writer.writerow(headers)
try:
writer.writerow(next(reader))
i += 1
except StopIteration:
records_exist = False
break
if i == 0:
# we only wrote the header, so delete that file
os.remove(target_filepath)
file_idx += 1
Run Code Online (Sandbox Code Playgroud)
我稍微修改了接受的答案以使其更简单
编辑:添加了导入语句,修改了打印异常的打印语句。header_row = rows.__next__()@Alex F 代码片段是为 python2 编写的,对于 python3,您还需要使用header_row = rows.next(). 谢谢你指出。
import os
import csv
def split_csv_into_chunks(file_location, out_dir, file_size=2):
count = 0
current_piece = 1
# file_to_split_name.csv
file_name = file_location.split("/")[-1].split(".")[0]
split_file_name_template = file_name + "__%s.csv"
splited_files_path = []
if not os.path.exists(out_dir):
os.makedirs(out_dir)
try:
with open(file_location, "rb") as csv_file:
rows = csv.reader(csv_file, delimiter=",")
headers_row = rows.next()
for row in rows:
if count % file_size == 0:
current_out_path = os.path.join(out_dir,
split_file_name_template%str(current_piece))
current_out_writer = None
current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=",")
current_out_writer.writerow(headers_row)
splited_files_path.append(current_out_path)
current_piece += 1
current_out_writer.writerow(row)
count += 1
return True, splited_files_path
except Exception as e:
print("Exception occurred as {}".format(e))
return False, splited_files_path
Run Code Online (Sandbox Code Playgroud)