Hel*_*nar 1 python directory file pattern-matching
假设我有这样的文件夹
        rootfolder
            |
       /    \    \
      01    02    03 ....
      |
13_itemname.xml
Run Code Online (Sandbox Code Playgroud)
所以在我的rootfolder下,每个目录代表一个月(如01、02、03),每个目录下的文件以创建小时和项目名称命名,如16_item1.xml、24_item1.xml等.你可能已经猜到,项目有好几个,每个项目每小时生成一个xml文件.
现在我想做两件事:
我需要生成一个月的项目名称列表,即对于01,我有item1,item2和item3.
我需要按项目过滤文件,例如对于item1:我想读取从01_item1.xml到24_item1.xml的每个文件.
我怎样才能以简单的方式在Python中实现这些目标?
这里有两种方法可以满足您的要求(如果我理解的话).一个有正则表达式,一个没有.你选择你喜欢哪一个;)
可能看起来像魔术的一点是"setdefault"那一行.有关说明,请参阅文档.至于它究竟是如何工作的,我把它留作"读者的练习";)
from os import listdir
from os.path import join
DATA_ROOT = "testdata"
def folder_items_no_regex(month_name):
    """Group the ``.xml`` files of one month folder by item name.

    Every entry of ``DATA_ROOT/month_name`` is split at its first
    underscore into a prefix and a remainder. Entries without an
    underscore, with an empty prefix or remainder, or whose remainder does
    not end in ``.xml`` are ignored. The remainder minus the ``.xml``
    extension is used as the item name.

    Args:
        month_name: Name of the sub-directory below ``DATA_ROOT`` to scan.

    Returns:
        A dict mapping each item name to the set of filenames belonging to
        it (ordering is irrelevant, hence plain dict and sets).

    Raises:
        OSError: Propagated from ``listdir`` when the folder cannot be
            listed.
    """
    items = {}
    for filename in listdir(join(DATA_ROOT, month_name)):
        # partition() always yields three parts, so a filename without "_"
        # is skipped here instead of raising ValueError on unpacking the
        # way ``split("_", 1)`` would.
        date, separator, name = filename.partition("_")
        if not separator or not date or not name:
            continue

        # ignore non-.xml files
        if not name.endswith(".xml"):
            continue

        # Cut off the ".xml" extension to obtain the item name; setdefault
        # creates the set the first time an item name is seen.
        items.setdefault(name[:-4], set()).add(filename)
    return items
def folder_items_regex(month_name):
    """Group the ``.xml`` files of one month folder by item name (regex).

    Every entry of ``DATA_ROOT/month_name`` is matched against the pattern
    ``<digits>_<item name>.xml``; entries that do not match are ignored.

    Args:
        month_name: Name of the sub-directory below ``DATA_ROOT`` to scan.

    Returns:
        A dict mapping each item name to the set of filenames belonging to
        it (ordering is irrelevant, hence plain dict and sets).

    Raises:
        OSError: Propagated from ``listdir`` when the folder cannot be
            listed.
    """
    import re

    # The pattern:
    #   1. match the beginning of the string "^"
    #   2. capture 1 or more digits ( \d+ )
    #   3. match the "_"
    #   4. capture any characters, as few as possible: (.*?)
    #   5. match ".xml"
    #   6. match the very end of the string "\Z"
    # "\Z" is used rather than "$" because "$" also matches just before a
    # trailing newline, which would accept a file named "1_a.xml\n".
    pattern = re.compile(r"^(\d+)_(.*?)\.xml\Z")

    items = {}
    for filename in listdir(join(DATA_ROOT, month_name)):
        match = pattern.match(filename)
        if not match:
            continue

        # Group 2 is the item name; the numeric prefix (group 1) is not
        # needed for grouping.
        name = match.group(2)

        # setdefault creates the set the first time an item name is seen.
        items.setdefault(name, set()).add(filename)
    return items
if __name__ == "__main__":
    from pprint import pprint

    # Demo: run both implementations against month "02" and print the same
    # three views for each. Single-argument print(...) calls produce the
    # same output under Python 2 and Python 3.
    for build_index in (folder_items_no_regex, folder_items_regex):
        data = build_index("02")
        print("--- The dict ---------------")
        pprint(data)
        print("--- The items --------------")
        pprint(sorted(data.keys()))
        print("--- The files for item1 ---- ")
        # .get() keeps the demo from crashing with KeyError when the folder
        # holds no "item1" files; an empty list is printed instead.
        pprint(sorted(data.get("item1", ())))
Run Code Online (Sandbox Code Playgroud)