Skip to content

Commit

Permalink
Switch to pattern matching.
Browse files Browse the repository at this point in the history
  • Loading branch information
matz-e committed Dec 20, 2017
1 parent 3d378be commit 227afe1
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 14 deletions.
33 changes: 20 additions & 13 deletions lobster/core/dataset.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections import defaultdict
import fnmatch
import math
import os

Expand All @@ -11,15 +12,16 @@
]


def flatten(files, exts=None):
def flatten(files, matches=None):
"""Flatten a list of directories or files to a single list of files.
Parameters
----------
files : str or list
A list of paths to expand. Can also be a string containing a path.
exts : list
Which file extensions to return.
matches : list
A list of patterns to match files against. Only successfully
matched files will be returned.
Returns
-------
Expand All @@ -28,6 +30,12 @@ def flatten(files, exts=None):
parameter `files`, optionally restricted to files whose base name
matches one of the patterns in `matches`.
"""
def matchfn(fn):
    """Tell whether the base name of `fn` matches any pattern in `matches`.

    Only the final path component is compared, using shell-style
    wildcard matching as implemented by `fnmatch.fnmatch`.
    """
    name = os.path.basename(fn)
    return any(fnmatch.fnmatch(name, pattern) for pattern in matches)
res = []
if not isinstance(files, list):
files = [files]
Expand All @@ -37,8 +45,8 @@ def flatten(files, exts=None):
res.extend(fs.ls(entry))
elif fs.isfile(entry):
res.append(entry)
if exts:
return [fn for fn in res if os.path.splitext(fn)[1] in exts]
if matches:
return [fn for fn in res if matchfn(fn)]
return res


Expand Down Expand Up @@ -85,27 +93,26 @@ class Dataset(Configurable):
pointing to a single file or directory.
files_per_task : int
How many files to process in one task. Defaults to 1.
extensions: list
A list of file extensions to process. Defaults to `None` and
will use all files considered. File extensions should be given
with a leading period.
patterns : list
A list of shell-style file patterns to match filenames against.
Defaults to `None` and will use all files considered.
"""
_mutable = {}

def __init__(self, files, files_per_task=1, extensions=None):
def __init__(self, files, files_per_task=1, patterns=None):
self.files = files
self.files_per_task = files_per_task
self.extensions = extensions
self.patterns = patterns
self.total_units = 0

def validate(self):
return len(flatten(self.files, self.extensions)) > 0
return len(flatten(self.files, self.patterns)) > 0

def get_info(self):
dset = DatasetInfo()
dset.file_based = True

files = flatten(self.files, self.extensions)
files = flatten(self.files, self.patterns)
dset.tasksize = self.files_per_task
dset.total_units = len(files)
self.total_units = len(files)
Expand Down
5 changes: 4 additions & 1 deletion test/test_core_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,5 +68,8 @@ def test_flatten(self):
info = Dataset(files=['spam']).get_info()
assert len(info.files) == 8

info = Dataset(files=['spam'], extensions=['.txt']).get_info()
info = Dataset(files=['spam'], patterns=['*.txt']).get_info()
assert len(info.files) == 5

info = Dataset(files=['spam'], patterns=['[12].txt']).get_info()
assert len(info.files) == 2

0 comments on commit 227afe1

Please sign in to comment.