Ale*_*dru 8 .net c# windows boost visual-c++
我正在开发文件同步服务,以便在不同计算机上的两个文件夹之间同步文件.我需要找到一种非常快速的方法来枚举目录并从中提取以下信息:
到目前为止,我已经想出了这个:
static void Main(string[] args)
{
List<Tuple<string, DateTime>> files = new List<Tuple<string, DateTime>>();
List<Tuple<string, DateTime>> directories = new List<Tuple<string, DateTime>>();
Stopwatch watch = new Stopwatch();
while (true)
{
watch.Start();
while (!CheckFolderRecursiveSingleThreaded("C:\\", out files, out directories))
{
// You can assume for all intents and purposes that drive C does exist and that you have access to it, which will cause this sleep to not get called.
Thread.Sleep(1000);
}
watch.Stop();
Console.WriteLine(watch.ElapsedMilliseconds);
watch.Reset();
// Do something with the information.
Thread.Sleep(1000);
}
}
static bool CheckFolderRecursiveSingleThreaded(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
try
{
DirectoryInfo directoryInformation = new DirectoryInfo(path);
List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
foreach (FileInfo file in directoryInformation.GetFiles())
{
fileList.Add(new Tuple<string, DateTime>(file.FullName, file.LastWriteTimeUtc));
}
List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();
foreach (DirectoryInfo directory in directoryInformation.GetDirectories())
{
// Check for the ReparsePoint flag, which will indicate a symbolic link.
if (!directory.Attributes.HasFlag(FileAttributes.ReparsePoint))
{
directoryList.Add(new Tuple<string, DateTime>(directory.FullName, directory.LastWriteTimeUtc));
List<Tuple<string, DateTime>> directoryFiles;
List<Tuple<string, DateTime>> directoryFolders;
if (CheckFolderRecursiveSingleThreaded(directory.FullName, out directoryFiles, out directoryFolders))
{
fileList.AddRange(directoryFiles);
directoryList.AddRange(directoryFolders);
}
}
}
files = fileList;
directories = directoryList;
return true;
}
catch
{
files = null;
directories = null;
return false;
}
}
Run Code Online (Sandbox Code Playgroud)
Performance-wise, it takes about 22 seconds (regardless of running in release or debug mode without the debugger attached) to enumerate through my C:\ drive and produce a list of about 549,254 files and 83,235 folders it has access to, but can it be faster? I am open to any suggestions, even MSVC++ suggestions.
Edit: 12 seconds with LINQ's AsParallel because of multithreading (must be tested in Release mode). Note that this parallelizes for all C:\ sub-folders, but recursive calls will be made to the single-threaded implementation I have above, otherwise it would take a REALLY long time to parallelize for all folders all the time!
static bool CheckFolderParallelled(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
try
{
DirectoryInfo directoryInformation = new DirectoryInfo(path);
List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
foreach (FileInfo file in directoryInformation.GetFiles())
{
fileList.Add(new Tuple<string, DateTime>(file.FullName, file.LastWriteTimeUtc));
}
List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();
directoryInformation.GetDirectories().AsParallel().ForAll(directory =>
{
// Check for the ReparsePoint flag, which will indicate a symbolic link.
if (!directory.Attributes.HasFlag(FileAttributes.ReparsePoint))
{
directoryList.Add(new Tuple<string, DateTime>(directory.FullName, directory.LastWriteTimeUtc));
List<Tuple<string, DateTime>> directoryFiles;
List<Tuple<string, DateTime>> directoryFolders;
if (CheckFolderRecursiveSingleThreaded(directory.FullName, out directoryFiles, out directoryFolders))
{
fileList.AddRange(directoryFiles);
directoryList.AddRange(directoryFolders);
}
}
});
files = fileList;
directories = directoryList;
return true;
}
catch
{
files = null;
directories = null;
return false;
}
}
Run Code Online (Sandbox Code Playgroud)
Edit: Still about 21 seconds using Alexei's linked solution's accepted answer from Mark Gravell. This non-recursive technique is not the fastest (probably the cost of keeping this Queue datatype alive is just as expensive as the cost of pushing and popping calls to this method on the stack):
static bool CheckFolderNonRecursive(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
try
{
List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();
ConcurrentQueue<DirectoryInfo> pendingSearches = new ConcurrentQueue<DirectoryInfo>();
pendingSearches.Enqueue(new DirectoryInfo(path));
DirectoryInfo pendingDirectory;
while (pendingSearches.Count > 0)
{
if (pendingSearches.TryDequeue(out pendingDirectory))
{
try
{
foreach (FileInfo file in pendingDirectory.GetFiles())
{
fileList.Add(new Tuple<string, DateTime>(file.FullName, file.LastWriteTimeUtc));
}
foreach (DirectoryInfo directory in pendingDirectory.GetDirectories())
{
// Check for the ReparsePoint flag, which will indicate a symbolic link.
if (!directory.Attributes.HasFlag(FileAttributes.ReparsePoint))
{
directoryList.Add(new Tuple<string, DateTime>(directory.FullName, directory.LastWriteTimeUtc));
pendingSearches.Enqueue(directory);
}
}
}
catch { } // Ignore directories with no access rights.
}
}
files = fileList;
directories = directoryList;
return true;
}
catch
{
files = null;
directories = null;
return false;
}
}
Run Code Online (Sandbox Code Playgroud)
Edit: This question is open-ended towards .NET, because there may be a faster way with MSVC++ libraries like boost, but I have yet to come across a faster method. If anyone can beat my C# method with a faster C drive enumerator in C++ that pulls the same data, first of all kudos to you for doing it faster, second of all I would be really interested to see it, third of all it would help a lot of people out (not just myself). I got this far into boost until I realized the following method took around 200,000 ms, much, much longer than any code I've posted above:
#include "stdafx.h"
#include <iostream>
#include <Windows.h>
#include <boost/filesystem.hpp>
#include <boost/foreach.hpp>
#include <boost/timer.hpp>
namespace fs = boost::filesystem;
bool IterateDirectory(const wchar_t *directory);
int _tmain(int argc, _TCHAR* argv[])
{
boost::timer timer = boost::timer();
while (true)
{
timer.restart();
// L makes it wide, since IterateDirectory takes wchar_t.
// R makes it a raw string literal, which tells the compiler to parse the string as-is, not escape characters and fancy tricks.
IterateDirectory(LR"(C:\)");
std::cout << "Elapsed time: " << timer.elapsed() * 1000 << " ms" << std::endl;
Sleep(1000);
}
return 0;
}
// IterateDirectory takes wchar_t because path.c_str() always returns wchar_t whether you are using unicode or multibyte.
bool IterateDirectory(const wchar_t *directory)
{
if (boost::filesystem::exists(directory))
{
fs::directory_iterator it(directory), eod;
BOOST_FOREACH(fs::path path, std::make_pair(it, eod))
{
try
{
if (is_regular_file(path))
{
//std::cout << path << ", last write time: " << last_write_time(path) << '.' << std::endl;
}
if (is_directory(path))
{
//std::cout << path << ", last write time: " << last_write_time(path) << '.' << std::endl;
// path.c_str() always returns wchar_t, whether you are using unicode or multibyte. This is probably because of multi-language support inside of the Windows operating system and file structure.
IterateDirectory(path.c_str());
}
}
catch (...) { } // Ignore directories we don't have access to.
}
return true;
}
return false;
}
Run Code Online (Sandbox Code Playgroud)
Edit: Using PInvoke to FindFirstFile and FindNextFile took about 6 seconds to iterate my entire C drive (thanks to the duplicate link and Sam Saffron's answer). But...can it be faster?
[DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
public static extern IntPtr FindFirstFileW(string lpFileName, out WIN32_FIND_DATAW lpFindFileData);
[DllImport("kernel32.dll", CharSet = CharSet.Unicode)]
public static extern bool FindNextFile(IntPtr hFindFile, out WIN32_FIND_DATAW lpFindFileData);
[DllImport("kernel32.dll")]
public static extern bool FindClose(IntPtr hFindFile);
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Unicode)]
public struct WIN32_FIND_DATAW {
public FileAttributes dwFileAttributes;
internal System.Runtime.InteropServices.ComTypes.FILETIME ftCreationTime;
internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastAccessTime;
internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastWriteTime;
public int nFileSizeHigh;
public int nFileSizeLow;
public int dwReserved0;
public int dwReserved1;
[MarshalAs(UnmanagedType.ByValTStr, SizeConst = 260)]
public string cFileName;
[MarshalAs(UnmanagedType.ByValTStr, SizeConst = 14)]
public string cAlternateFileName;
}
static IntPtr INVALID_HANDLE_VALUE = new IntPtr(-1);
static bool FindNextFilePInvokeRecursive(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();
WIN32_FIND_DATAW findData;
IntPtr findHandle = INVALID_HANDLE_VALUE;
List<Tuple<string, DateTime>> info = new List<Tuple<string,DateTime>>();
try
{
findHandle = FindFirstFileW(path + @"\*", out findData);
if (findHandle != INVALID_HANDLE_VALUE)
{
do
{
if (findData.cFileName == "." || findData.cFileName == "..") continue;
string fullPath = path + (path.EndsWith("\\") ? String.Empty : "\\") + findData.cFileName;
// Check if this is a directory and not a symbolic link since symbolic links could lead to repeated files and folders as well as infinite loops.
if (findData.dwFileAttributes.HasFlag(FileAttributes.Directory) && !findData.dwFileAttributes.HasFlag(FileAttributes.ReparsePoint))
{
directoryList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
List<Tuple<string, DateTime>> subDirectoryFileList = new List<Tuple<string, DateTime>>();
List<Tuple<string, DateTime>> subDirectoryDirectoryList = new List<Tuple<string, DateTime>>();
if (FindNextFilePInvokeRecursive(fullPath, out subDirectoryFileList, out subDirectoryDirectoryList))
{
fileList.AddRange(subDirectoryFileList);
directoryList.AddRange(subDirectoryDirectoryList);
}
}
else if (!findData.dwFileAttributes.HasFlag(FileAttributes.Directory))
{
fileList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
}
}
while (FindNextFile(findHandle, out findData));
}
}
catch (Exception exception)
{
Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
files = null;
directories = null;
return false;
}
if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
files = fileList;
directories = directoryList;
return true;
}
public static class FILETIMEExtensions
{
public static DateTime ToDateTime(this System.Runtime.InteropServices.ComTypes.FILETIME filetime)
{
long highBits = filetime.dwHighDateTime;
highBits = highBits << 32;
return DateTime.FromFileTimeUtc(highBits + (long)filetime.dwLowDateTime);
}
}
Run Code Online (Sandbox Code Playgroud)
Edit: Yes, it can be faster. Using techniques to parallelize the target folder's subdirectory recursions, I can get it to 4 seconds using the above FindNextFilePInvokeRecursive method. That's 4 seconds to iterate my entire C drive with the data I need. I can see in process monitor, I eat up about 30% CPU and only 1% disk at most, which is a little odd to me, not sure why that is at the moment, maybe just this linked list traversal style causes it to be pretty negligible. Ideally it should eat 100% CPU at least, but that might depend on the number and depth of subfolders you parallelize on. But can it be faster?!
static bool FindNextFilePInvokeRecursiveParalleled(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();
WIN32_FIND_DATAW findData;
IntPtr findHandle = INVALID_HANDLE_VALUE;
List<Tuple<string, DateTime>> info = new List<Tuple<string, DateTime>>();
try
{
findHandle = FindFirstFileW(path + @"\*", out findData);
if (findHandle != INVALID_HANDLE_VALUE)
{
do
{
if (findData.cFileName == "." || findData.cFileName == "..") continue;
string fullPath = path + (path.EndsWith("\\") ? String.Empty : "\\") + findData.cFileName;
// Check if this is a directory and not a symbolic link since symbolic links could lead to repeated files and folders as well as infinite loops.
if (findData.dwFileAttributes.HasFlag(FileAttributes.Directory) && !findData.dwFileAttributes.HasFlag(FileAttributes.ReparsePoint))
{
directoryList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
}
else if (!findData.dwFileAttributes.HasFlag(FileAttributes.Directory))
{
fileList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
}
}
while (FindNextFile(findHandle, out findData));
directoryList.AsParallel().ForAll(x =>
{
List<Tuple<string, DateTime>> subDirectoryFileList = new List<Tuple<string, DateTime>>();
List<Tuple<string, DateTime>> subDirectoryDirectoryList = new List<Tuple<string, DateTime>>();
if (FindNextFilePInvokeRecursive(x.Item1, out subDirectoryFileList, out subDirectoryDirectoryList))
{
fileList.AddRange(subDirectoryFileList);
directoryList.AddRange(subDirectoryDirectoryList);
}
});
}
}
catch (Exception exception)
{
Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
files = null;
directories = null;
return false;
}
if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
files = fileList;
directories = directoryList;
return true;
}
Run Code Online (Sandbox Code Playgroud)
编辑:忘记在使用parallels时添加并发锁,否则您可能会遇到异常.还删除了元组,并为我的目的使用了FileInformation/DirectoryInformation类.这刮掉了0.5秒.现在3.5秒枚举我的C:驱动器.
[DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
public static extern IntPtr FindFirstFileW(string lpFileName, out WIN32_FIND_DATAW lpFindFileData);
[DllImport("kernel32.dll", CharSet = CharSet.Unicode)]
public static extern bool FindNextFile(IntPtr hFindFile, out WIN32_FIND_DATAW lpFindFileData);
[DllImport("kernel32.dll")]
public static extern bool FindClose(IntPtr hFindFile);
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Unicode)]
public struct WIN32_FIND_DATAW {
public FileAttributes dwFileAttributes;
internal System.Runtime.InteropServices.ComTypes.FILETIME ftCreationTime;
internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastAccessTime;
internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastWriteTime;
public int nFileSizeHigh;
public int nFileSizeLow;
public int dwReserved0;
public int dwReserved1;
[MarshalAs(UnmanagedType.ByValTStr, SizeConst = 260)]
public string cFileName;
[MarshalAs(UnmanagedType.ByValTStr, SizeConst = 14)]
public string cAlternateFileName;
}
static IntPtr INVALID_HANDLE_VALUE = new IntPtr(-1);
static bool FindNextFilePInvokeRecursive(string path, out List<FileInformation> files, out List<DirectoryInformation> directories)
{
List<FileInformation> fileList = new List<FileInformation>();
List<DirectoryInformation> directoryList = new List<DirectoryInformation>();
WIN32_FIND_DATAW findData;
IntPtr findHandle = INVALID_HANDLE_VALUE;
List<Tuple<string, DateTime>> info = new List<Tuple<string, DateTime>>();
try
{
findHandle = FindFirstFileW(path + @"\*", out findData);
if (findHandle != INVALID_HANDLE_VALUE)
{
do
{
// Skip current directory and parent directory symbols that are returned.
if (findData.cFileName != "." && findData.cFileName != "..")
{
string fullPath = path + @"\" + findData.cFileName;
// Check if this is a directory and not a symbolic link since symbolic links could lead to repeated files and folders as well as infinite loops.
if (findData.dwFileAttributes.HasFlag(FileAttributes.Directory) && !findData.dwFileAttributes.HasFlag(FileAttributes.ReparsePoint))
{
directoryList.Add(new DirectoryInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
List<FileInformation> subDirectoryFileList = new List<FileInformation>();
List<DirectoryInformation> subDirectoryDirectoryList = new List<DirectoryInformation>();
if (FindNextFilePInvokeRecursive(fullPath, out subDirectoryFileList, out subDirectoryDirectoryList))
{
fileList.AddRange(subDirectoryFileList);
directoryList.AddRange(subDirectoryDirectoryList);
}
}
else if (!findData.dwFileAttributes.HasFlag(FileAttributes.Directory))
{
fileList.Add(new FileInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
}
}
}
while (FindNextFile(findHandle, out findData));
}
}
catch (Exception exception)
{
Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
files = null;
directories = null;
return false;
}
if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
files = fileList;
directories = directoryList;
return true;
}
static bool FindNextFilePInvokeRecursiveParalleled(string path, out List<FileInformation> files, out List<DirectoryInformation> directories)
{
List<FileInformation> fileList = new List<FileInformation>();
object fileListLock = new object();
List<DirectoryInformation> directoryList = new List<DirectoryInformation>();
object directoryListLock = new object();
WIN32_FIND_DATAW findData;
IntPtr findHandle = INVALID_HANDLE_VALUE;
List<Tuple<string, DateTime>> info = new List<Tuple<string, DateTime>>();
try
{
path = path.EndsWith(@"\") ? path : path + @"\";
findHandle = FindFirstFileW(path + @"*", out findData);
if (findHandle != INVALID_HANDLE_VALUE)
{
do
{
// Skip current directory and parent directory symbols that are returned.
if (findData.cFileName != "." && findData.cFileName != "..")
{
string fullPath = path + findData.cFileName;
// Check if this is a directory and not a symbolic link since symbolic links could lead to repeated files and folders as well as infinite loops.
if (findData.dwFileAttributes.HasFlag(FileAttributes.Directory) && !findData.dwFileAttributes.HasFlag(FileAttributes.ReparsePoint))
{
directoryList.Add(new DirectoryInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
}
else if (!findData.dwFileAttributes.HasFlag(FileAttributes.Directory))
{
fileList.Add(new FileInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
}
}
}
while (FindNextFile(findHandle, out findData));
directoryList.AsParallel().ForAll(x =>
{
List<FileInformation> subDirectoryFileList = new List<FileInformation>();
List<DirectoryInformation> subDirectoryDirectoryList = new List<DirectoryInformation>();
if (FindNextFilePInvokeRecursive(x.FullPath, out subDirectoryFileList, out subDirectoryDirectoryList))
{
lock (fileListLock)
{
fileList.AddRange(subDirectoryFileList);
}
lock (directoryListLock)
{
directoryList.AddRange(subDirectoryDirectoryList);
}
}
});
}
}
catch (Exception exception)
{
Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
files = null;
directories = null;
return false;
}
if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
files = fileList;
directories = directoryList;
return true;
}
public class FileInformation
{
public string FullPath;
public DateTime LastWriteTime;
}
public class DirectoryInformation
{
public string FullPath;
public DateTime LastWriteTime;
}
Run Code Online (Sandbox Code Playgroud)
编辑:BK询问从FILETIME转换到DateTime:
public static class FILETIMEExtensions
{
public static DateTime ToDateTime(this System.Runtime.InteropServices.ComTypes.FILETIME time)
{
ulong high = (ulong)time.dwHighDateTime;
ulong low = (ulong)time.dwLowDateTime;
long fileTime = (long)((high << 32) + low);
return DateTime.FromFileTimeUtc(fileTime);
}
}
Run Code Online (Sandbox Code Playgroud)
使用 LINQ 和并行任务
var stuff = dir.GetFiles("*.*", System.IO.SearchOption.AllDirectories);
Parallel.ForEach(stuff, p=>{ //do things in parallel.. });
//or this
var q = stuff.AsParallel().Where(x => p(x)).Orderby(x => k(x)).Select(x => f(x));
foreach (var e in q) a(e);
Run Code Online (Sandbox Code Playgroud)