5 python csv glob traversal pandas
我是 Python 新手并试图了解某些库。不确定如何将 csv 上传到 SO,但此脚本适用于任何 csv,只需替换“SwitchedProviders_TopicModel”
我的目标是遍历文件目录中的所有 csv - C:\Users\jj\Desktop\autotranscribe 并将我的 python 脚本输出按文件写入 csv。
举例来说,假设上面的文件夹中有这些 csv 文件 -
'1003391793_1003391784_01bc7e411408166f7c5468f0.csv' '1003478130_1003478103_8eef05b0820cf0ffe9a9754c.csv' ...
我希望我的 python 脚本(见下文)对文件夹/目录中的每个 csv 进行单词计数,并将输出写入如下所示的数据帧(DataFrame)-
csvname pre existing exclusions limitations fourteen
1003391793_1003391784_01bc7e411408166f7c5468f0.csv 1 2 0 1
Run Code Online (Sandbox Code Playgroud)
我的脚本 -
import pandas as pd
from collections import defaultdict
def search_multiple_strings_in_file(file_name, list_of_strings):
    """Find the lines of a text file that contain any of the given strings.

    Matching is plain, case-sensitive substring matching on each raw line.

    Args:
        file_name: Path of the file to scan.
        list_of_strings: Substrings to look for in every line.

    Returns:
        A ``(list_of_results, count)`` tuple where ``list_of_results`` is a
        list of ``(matched_string, line_number, line_text)`` tuples (line
        numbers start at 1, trailing whitespace stripped from the text) and
        ``count`` is a plain dict mapping each string that was found at least
        once to its total number of occurrences in the file.
    """
    list_of_results = []
    count = defaultdict(int)
    # Open the file the caller asked for; the previous version ignored
    # ``file_name`` and always read one hard-coded CSV.
    with open(file_name, 'r') as read_obj:
        for line_number, line in enumerate(read_obj, start=1):
            # For each line, check every search string independently
            for string_to_search in list_of_strings:
                if string_to_search in line:
                    # A string may occur several times on one line
                    count[string_to_search] += line.count(string_to_search)
                    list_of_results.append(
                        (string_to_search, line_number, line.rstrip()))
    # dict() so callers get a regular dict without defaultdict auto-insertion
    return list_of_results, dict(count)
# Count the terms of interest in the sample CSV.
matched_lines, count = search_multiple_strings_in_file(
    'SwitchedProviders_TopicModel.csv',
    ['pre existing ', 'exclusions', 'limitations', 'fourteen'])
# Build the two-column frame straight from the (word, count) pairs. Passing
# the column names explicitly keeps this working when nothing matched:
# from_dict(..., orient='index').reset_index() yields a single column for an
# empty dict, so assigning two column names afterwards raised ValueError.
df = pd.DataFrame(list(count.items()), columns=['Word', 'Count'])
print(df)
Run Code Online (Sandbox Code Playgroud)
我怎样才能做到这一点?我只想统计特定单词(例如我脚本中的“fourteen”)的出现次数,而不是统计所有单词的出现次数。
其中一个 csv 的样本数据 - 感谢用户 Umar H 提供
df = pd.read_csv('1003478130_1003478103_8eef05b0820cf0ffe9a9754c.csv')
print(df.head(10).to_dict())
{'transcript': {0: 'hi thanks for calling ACCA this is many speaking could have the pleasure speaking with ', 1: 'so ', 2: 'hi ', 3: 'I have the pleasure speaking with my name is B. as in boy E. V. D. N. ', 4: 'thanks yes and I think I have your account pulled up could you please verify your email ', 5: "sure is yeah it's on _ 00 ", 6: 'I T. O.com ', 7: 'thank you how can I help ', 8: 'all right I mean I do have an insurance with you guys I just want to cancel the insurance ', 9: 'sure I can help with that what was the reason for cancellation '}, 'confidence': {0: 0.73, 1: 0.18, 2: 0.88, 3: 0.72, 4: 0.83, 5: 0.76, 6: 0.83, 7: 0.98, 8: 0.89, 9: 0.95}, 'from': {0: 1.69, 1: 1.83, 2: 2.06, 3: 2.13, 4: 2.36, 5: 2.98, 6: 3.17, 7: 3.65, 8: 3.78, 9: 3.93}, 'to': {0: 1.83, 1: 2.06, 2: 2.13, 3: 2.36, 4: 2.98, 5: 3.17, 6: 3.65, 7: 3.78, 8: 3.93, 9: 4.14}, 'speaker': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'Negative': {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.116, 9: 0.0}, 'Neutral': {0: 0.694, 1: 1.0, 2: 1.0, 3: 0.802, 4: 0.603, 5: 0.471, 6: 1.0, 7: 0.366, 8: 0.809, 9: 0.643}, 'Positive': {0: 0.306, 1: 0.0, 2: 0.0, 3: 0.198, 4: 0.397, 5: 0.529, 6: 0.0, 7: 0.634, 8: 0.075, 9: 0.357}, 'compound': {0: 0.765, 1: 0.0, 2: 0.0, 3: 0.5719, 4: 0.7845, 5: 0.5423, 6: 0.0, 7: 0.6369, 8: -0.1779, 9: 0.6124}}
Run Code Online (Sandbox Code Playgroud)
步骤 -
import string
from collections import Counter, defaultdict
from pathlib import Path
import pandas as pd
# Absolute path of the directory that holds the CSV files to process
# (note: this is a fixed location, not the current working directory).
inp_dir = Path(r'C:/Users/jj/Desktop/Bulk_Wav_Completed')
def search_multiple_strings_in_file(file_name, list_of_strings):
    """Scan a file line by line for occurrences of the given substrings.

    Returns a pair ``(hits, totals)``: ``hits`` is a list of
    ``(substring, line_number, stripped_line)`` tuples, one per substring per
    matching line (line numbers are 1-based); ``totals`` maps every substring
    that occurred at least once to its overall occurrence count.
    """
    hits = []
    totals = {}
    with open(file_name, 'r') as handle:
        line_no = 0
        for text in handle:
            line_no += 1
            for needle in list_of_strings:
                occurrences = text.count(needle)
                if not occurrences:
                    # Substring absent from this line, nothing to record
                    continue
                totals[needle] = totals.get(needle, 0) + occurrences
                hits.append((needle, line_no, text.rstrip()))
    return hits, totals
# Terms to count in every CSV. Each entry must be followed by a comma:
# Python silently concatenates adjacent string literals, so the earlier list
# actually searched for 'lemonadekennel', 'partnerslemonade' and 'bee4paws'.
# Each term appears only once, because a repeated term is counted twice per
# matching line by search_multiple_strings_in_file.
# NOTE: matching is by substring, so e.g. 'nation' also counts every
# 'nation wide' and 'paws' also counts every 'healthy paws'.
search_terms = [
    'nation', 'nation wide', 'trupanion', 'pet plan', 'best', 'embrace',
    'healthy paws', 'pet first', 'pet partners', 'lemon',
    'AKC', 'akc', 'kennel club', 'club', 'american kennel', 'american',
    'lemonade', 'kennel', 'figo', 'companion protect', 'true companion',
    'true panion', 'trusted pals', 'partners', 'partner',
    'wagmo', 'vagmo', 'bivvy', 'bivy', 'bee', '4paws', 'paws',
    'pet best', 'pets best',
]

# Maps each CSV file name to its {term: occurrences} dict.
result = {}
for csv_file in inp_dir.glob('**/*.csv'):
    print(csv_file)  # for debugging
    matched_lines, count = search_multiple_strings_in_file(
        csv_file, search_terms)
    print(count)  # for debugging
    result[csv_file.name] = count

# One row per file and one column per term that was found anywhere;
# a term that is missing from a given file is reported as 0.
df = pd.DataFrame(result).T.fillna(0).astype(int)
Run Code Online (Sandbox Code Playgroud)
输出 -
exclusions limitations pre existing
1.csv 1 3 1
2.csv 1 3 1
Run Code Online (Sandbox Code Playgroud)