Python:检查文件夹中是否有多个文件的最快方法

blu*_*isk 9 python

我正在寻找一种非常快速的方法来检查文件夹是否包含两个以上的文件。

我担心如果 /path/ 中有很多文件,len(os.listdir('/path/')) > 2 可能会变得非常慢,特别是因为该函数会被多个进程同时频繁调用。

Pon*_*UHC 10

PEP471确实引入了另一个功能:os.scandir(path)

由于它返回的是一个生成器,因此不会创建任何列表,即使在最坏的情况(巨大的目录)下也仍然是轻量级的。

它的更高层接口 os.walk(path) 可以让您遍历一个目录,而不必一次性列出其中的全部内容。

这是针对您的具体情况的代码示例:

import os

# The question asks for "more than two files", i.e. at least three.
MINIMUM_SIZE = 3

file_count = 0
# The context manager closes the scandir iterator even when the loop breaks
# early; without it the iterator stays open until it is garbage collected.
with os.scandir('.') as entries:
    for entry in entries:
        if entry.is_file():
            file_count += 1
            # Stop scanning as soon as enough files have been seen.
            if file_count == MINIMUM_SIZE:
                break

# True when the current directory holds at least MINIMUM_SIZE regular files.
enough_files = (file_count == MINIMUM_SIZE)
Run Code Online (Sandbox Code Playgroud)

  • 这确实是正确的,并且是迄今为止最快的方法,我将另一个答案标记为正确的,但是,由于额外的时间 (2认同)

Seb*_*zny 6

为了获得最快的速度,这可能是一件很棘手的事情。

我的猜测是:


def iterdir_approach(path):
    """Return True if directory *path* contains more than two regular files.

    Entries that are not regular files (e.g. sub-directories) are ignored.
    Only ``StopIteration`` (fewer than three files) is treated as "not
    enough"; any other error, such as a missing directory, propagates to
    the caller instead of being silently reported as False.
    """
    # pathlib spells the check ``is_file``; ``isfile`` does not exist on Path
    # and previously raised AttributeError, which a bare ``except`` hid.
    iter_of_files = (x for x in Path(path).iterdir() if x.is_file())
    try:
        # Pull three files; next() raises StopIteration when fewer exist.
        for _ in range(3):
            next(iter_of_files)
    except StopIteration:
        return False
    return True
Run Code Online (Sandbox Code Playgroud)

我们创建一个生成器并尝试耗尽它,必要时捕获抛出的异常。

为了分析这些方法,我们创建了一堆目录,其中包含一堆文件:

import itertools
import os
import random
import shutil
import tempfile
import timeit
from pathlib import Path

import matplotlib.pyplot as plt


def create_temp_directory(num_directories):
    """Build a temporary directory tree for benchmarking and return its path.

    The tree root is a fresh ``tempfile.mkdtemp()`` directory containing
    ``num_directories`` sub-directories named ``subdir_<i>``. Sub-directory
    ``i`` receives a random number of files between 0 and ``i`` (inclusive),
    named ``file_<j>.txt``. The caller is responsible for deleting the tree.
    """
    root = tempfile.mkdtemp()
    for index in range(num_directories):
        subdir = os.path.join(root, f"subdir_{index}")
        os.makedirs(subdir)
        # Later sub-directories may hold more files than earlier ones.
        how_many = random.randint(0, index)
        for name in (f"file_{n}.txt" for n in range(how_many)):
            with open(os.path.join(subdir, name), 'w') as handle:
                handle.write("Sample content")
    return root
Run Code Online (Sandbox Code Playgroud)

我们定义了各种方法(其他方法是从该问题的其他答案中复制的):


def iterdir_approach(path):
    #@swozny
    """Return True if directory *path* contains more than two regular files.

    Entries that are not regular files (e.g. sub-directories) are ignored.
    Only ``StopIteration`` (fewer than three files) is treated as "not
    enough"; any other error, such as a missing directory, propagates to
    the caller instead of being silently reported as False.
    """
    # pathlib spells the check ``is_file``; ``isfile`` does not exist on Path
    # and previously raised AttributeError, which a bare ``except`` hid.
    iter_of_files = (x for x in Path(path).iterdir() if x.is_file())
    try:
        # Pull three files; next() raises StopIteration when fewer exist.
        for _ in range(3):
            next(iter_of_files)
    except StopIteration:
        return False
    return True

def len_os_dir_approach(path):
    #@bluppfisk
    """Return True if directory *path* has more than two entries.

    Baseline approach: the complete listing is built with ``os.listdir``
    and its length is compared. Every entry counts, not only regular files.
    """
    entry_names = os.listdir(path)
    return 2 < len(entry_names)


def check_files_os_scandir_approach(path):
    #@PoneyUHC
    """Return True if directory *path* contains more than two regular files.

    Scans with ``os.scandir`` and stops at the third regular file, so the
    remaining entries of a large directory are never visited.
    """
    MINIMUM_SIZE = 3
    file_count = 0
    # The context manager closes the scandir iterator on the early return;
    # otherwise it stays open until it is garbage collected.
    with os.scandir(path) as entries:
        for entry in entries:
            if entry.is_file():
                file_count += 1
                # The count only changes here, so only check here.
                if file_count == MINIMUM_SIZE:
                    return True
    return False


def path_resolve_approach(path):
    #@matleg
    """Return True if directory *path* contains more than two regular files.

    The path is resolved first, then its direct children are enumerated
    with ``glob("*")`` and regular files are counted until a third is found.
    """
    directory_path = Path(path).resolve()
    nb_files = 0
    for file_path in directory_path.glob("*"):
        if file_path.is_file():
            nb_files += 1
            # The count only changes here, so only check here.
            if nb_files > 2:
                return True
    return False

def dilettant_approach(path):
    #@dilettant
    """Return True if directory *path* has more than two entries.

    Entries are not filtered by type, because the question states that the
    folder contains only files; sub-directories would be counted as well.
    """
    enough = 3  # "more than two" means at least three entries

    # The context manager closes the scandir iterator once the first
    # `enough` entries have been taken; islice never reads beyond them.
    with os.scandir(path) as gen:
        has_enough = len(list(itertools.islice(gen, enough))) >= enough

    return has_enough
def adrian_ang_approach(path):
    #@adrian_ang
    """Return True if directory *path* contains more than two regular files.

    Iterates ``os.scandir`` inside a context manager and returns as soon as
    the third regular file is encountered.
    """
    still_needed = 3
    with os.scandir(path) as listing:
        for item in listing:
            if not item.is_file():
                continue
            still_needed -= 1
            if still_needed == 0:
                return True
    return False

Run Code Online (Sandbox Code Playgroud)

然后我们使用 timeit.timeit 对这些方法进行性能分析,并绘制不同目录数量下的执行时间:


# Benchmark driver: for each tree size, time every approach over all
# sub-directories of one freshly created temporary tree.
num_directories_list = [10, 50, 100, 200, 500, 1000]
approach1_times = []
approach2_times = []
approach3_times = []
approach4_times = []
approach5_times = []
approach6_times = []


for num_directories in num_directories_list:
    temp_dir = create_temp_directory(num_directories)
    try:
        # Benchmark the tree that was just created. Building a second tree
        # here would leave one tree unused and the other never deleted.
        subdir_paths = [str(p) for p in Path(temp_dir).iterdir()]
        approach1_time = timeit.timeit(lambda: [iterdir_approach(path) for path in subdir_paths], number=5)
        approach2_time = timeit.timeit(lambda: [check_files_os_scandir_approach(path) for path in subdir_paths], number=5)
        approach3_time = timeit.timeit(lambda: [path_resolve_approach(path) for path in subdir_paths], number=5)
        approach4_time = timeit.timeit(lambda: [len_os_dir_approach(path) for path in subdir_paths], number=5)
        approach5_time = timeit.timeit(lambda: [dilettant_approach(path) for path in subdir_paths], number=5)
        approach6_time = timeit.timeit(lambda: [adrian_ang_approach(path) for path in subdir_paths], number=5)

        approach1_times.append(approach1_time)
        approach2_times.append(approach2_time)
        approach3_times.append(approach3_time)
        approach4_times.append(approach4_time)
        approach5_times.append(approach5_time)
        approach6_times.append(approach6_time)
    finally:
        # Always remove the temporary tree, even if an approach raised.
        shutil.rmtree(temp_dir)

Run Code Online (Sandbox Code Playgroud)

结果可视化


# One curve per approach: (collected timings, legend label), drawn in order.
series = (
    (approach1_times, 'iterdir_approach'),
    (approach2_times, 'check_files_os_scandir_approach'),
    (approach3_times, 'path_resolve_approach'),
    (approach4_times, 'os_dir_approach'),
    (approach5_times, 'dilettant_approach'),
    (approach6_times, 'adrian_ang_approach'),
)
for timings, legend_label in series:
    plt.plot(num_directories_list, timings, label=legend_label)

# Axis labels, title and legend, then display the figure.
plt.xlabel('Number of Directories')
plt.ylabel('Execution Time (seconds)')
plt.title('Performance Comparison')
plt.legend()
plt.show()

Run Code Online (Sandbox Code Playgroud)

在此输入图像描述

最佳 3 个解决方案的特写: 在此输入图像描述

  • 已修复,谢谢!我原以为用“>”就可以了,但其实这还不够 (2认同)