@@ -78,33 +78,12 @@ def _is_case_sensitive(flavour):
7878}
7979
8080
81- @functools .lru_cache ()
82- def _make_selector (pattern_parts , flavour , case_sensitive ):
83- pat = pattern_parts [0 ]
84- if not pat :
85- return _TerminatingSelector ()
86- if pat == '**' :
87- child_parts_idx = 1
88- while child_parts_idx < len (pattern_parts ) and pattern_parts [child_parts_idx ] == '**' :
89- child_parts_idx += 1
90- child_parts = pattern_parts [child_parts_idx :]
91- if '**' in child_parts :
92- cls = _DoubleRecursiveWildcardSelector
93- else :
94- cls = _RecursiveWildcardSelector
95- else :
96- child_parts = pattern_parts [1 :]
97- if pat == '..' :
98- cls = _ParentSelector
99- elif '**' in pat :
100- raise ValueError ("Invalid pattern: '**' can only be an entire path component" )
101- else :
102- cls = _WildcardSelector
103- return cls (pat , child_parts , flavour , case_sensitive )
104-
105-
10681@functools .lru_cache (maxsize = 256 )
10782def _compile_pattern (pat , case_sensitive ):
83+ """Compile given glob pattern to a re.Pattern object (observing case
84+ sensitivity), or None if the pattern should match everything."""
85+ if pat == '*' :
86+ return None
10887 flags = re .NOFLAG if case_sensitive else re .IGNORECASE
10988 return re .compile (fnmatch .translate (pat ), flags ).match
11089
@@ -127,7 +106,11 @@ def _compile_pattern_lines(pattern_lines, case_sensitive):
127106 # Match the start of the path, or just after a path separator
128107 parts = ['^' ]
129108 for part in pattern_lines .splitlines (keepends = True ):
130- if part == '**\n ' :
109+ if part == '*\n ' :
110+ part = r'.+\n'
111+ elif part == '*' :
112+ part = r'.+'
113+ elif part == '**\n ' :
131114 # '**/' component: we use '[\s\S]' rather than '.' so that path
132115 # separators (i.e. newlines) are matched. The trailing '^' ensures
133116 # we terminate after a path separator (i.e. on a new line).
@@ -154,114 +137,70 @@ def _compile_pattern_lines(pattern_lines, case_sensitive):
154137 return re .compile ('' .join (parts ), flags = flags )
155138
156139
157- class _Selector :
158- """A selector matches a specific glob pattern part against the children
159- of a given path."""
160-
161- def __init__ (self , child_parts , flavour , case_sensitive ):
162- self .child_parts = child_parts
163- if child_parts :
164- self .successor = _make_selector (child_parts , flavour , case_sensitive )
165- self .dironly = True
166- else :
167- self .successor = _TerminatingSelector ()
168- self .dironly = False
169-
170- def select_from (self , parent_path , follow_symlinks ):
171- """Iterate over all child paths of `parent_path` matched by this
172- selector. This can contain parent_path itself."""
173- path_cls = type (parent_path )
174- scandir = path_cls ._scandir
175- if not parent_path .is_dir ():
176- return iter ([])
177- return self ._select_from (parent_path , scandir , follow_symlinks )
178-
179-
180- class _TerminatingSelector :
181-
182- def _select_from (self , parent_path , scandir , follow_symlinks ):
183- yield parent_path
184-
185-
186- class _ParentSelector (_Selector ):
187-
188- def __init__ (self , name , child_parts , flavour , case_sensitive ):
189- _Selector .__init__ (self , child_parts , flavour , case_sensitive )
190-
191- def _select_from (self , parent_path , scandir , follow_symlinks ):
192- path = parent_path ._make_child_relpath ('..' )
193- for p in self .successor ._select_from (path , scandir , follow_symlinks ):
194- yield p
195-
196-
197- class _WildcardSelector (_Selector ):
198-
199- def __init__ (self , pat , child_parts , flavour , case_sensitive ):
200- _Selector .__init__ (self , child_parts , flavour , case_sensitive )
201- if case_sensitive is None :
202- # TODO: evaluate case-sensitivity of each directory in _select_from()
203- case_sensitive = _is_case_sensitive (flavour )
204- self .match = _compile_pattern (pat , case_sensitive )
205-
206- def _select_from (self , parent_path , scandir , follow_symlinks ):
207- follow_dirlinks = True if follow_symlinks is None else follow_symlinks
140+ def _select_children (parent_paths , dir_only , follow_symlinks , match ):
141+ """Yield direct children of given paths, filtering by name and type."""
142+ if follow_symlinks is None :
143+ follow_symlinks = True
144+ for parent_path in parent_paths :
208145 try :
209146 # We must close the scandir() object before proceeding to
210147 # avoid exhausting file descriptors when globbing deep trees.
211- with scandir ( parent_path ) as scandir_it :
148+ with parent_path . _scandir ( ) as scandir_it :
212149 entries = list (scandir_it )
213150 except OSError :
214151 pass
215152 else :
216153 for entry in entries :
217- if self . dironly :
154+ if dir_only :
218155 try :
219- if not entry .is_dir (follow_symlinks = follow_dirlinks ):
156+ if not entry .is_dir (follow_symlinks = follow_symlinks ):
220157 continue
221158 except OSError :
222159 continue
223160 name = entry .name
224- if self .match (name ):
225- path = parent_path ._make_child_relpath (name )
226- for p in self .successor ._select_from (path , scandir , follow_symlinks ):
227- yield p
228-
161+ if match is None or match (name ):
162+ yield parent_path ._make_child_relpath (name )
229163
230- class _RecursiveWildcardSelector (_Selector ):
231-
232- def __init__ (self , pat , child_parts , flavour , case_sensitive ):
233- _Selector .__init__ (self , child_parts , flavour , case_sensitive )
234-
235- def _iterate_directories (self , parent_path , follow_symlinks ):
236- yield parent_path
237- for dirpath , dirnames , _ in parent_path .walk (follow_symlinks = follow_symlinks ):
238- for dirname in dirnames :
239- yield dirpath ._make_child_relpath (dirname )
240-
241- def _select_from (self , parent_path , scandir , follow_symlinks ):
242- follow_dirlinks = False if follow_symlinks is None else follow_symlinks
243- successor_select = self .successor ._select_from
244- for starting_point in self ._iterate_directories (parent_path , follow_dirlinks ):
245- for p in successor_select (starting_point , scandir , follow_symlinks ):
246- yield p
247-
248-
249- class _DoubleRecursiveWildcardSelector (_RecursiveWildcardSelector ):
250- """
251- Like _RecursiveWildcardSelector, but also de-duplicates results from
252- successive selectors. This is necessary if the pattern contains
253- multiple non-adjacent '**' segments.
254- """
255164
256- def _select_from (self , parent_path , scandir , follow_symlinks ):
257- yielded = set ()
258- try :
259- for p in super ()._select_from (parent_path , scandir , follow_symlinks ):
260- if p not in yielded :
261- yield p
262- yielded .add (p )
263- finally :
264- yielded .clear ()
165+ def _select_recursive (parent_paths , dir_only , follow_symlinks ):
166+ """Yield given paths and all their subdirectories, recursively."""
167+ if follow_symlinks is None :
168+ follow_symlinks = False
169+ for parent_path in parent_paths :
170+ paths = [parent_path ]
171+ while paths :
172+ path = paths .pop ()
173+ yield path
174+ try :
175+ # We must close the scandir() object before proceeding to
176+ # avoid exhausting file descriptors when globbing deep trees.
177+ with path ._scandir () as scandir_it :
178+ entries = list (scandir_it )
179+ except OSError :
180+ pass
181+ else :
182+ for entry in entries :
183+ try :
184+ if entry .is_dir (follow_symlinks = follow_symlinks ):
185+ paths .append (path ._make_child_relpath (entry .name ))
186+ continue
187+ except OSError :
188+ pass
189+ if not dir_only :
190+ yield path ._make_child_relpath (entry .name )
191+
192+
193+ def _select_unique (paths ):
194+ """Yields the given paths, filtering out duplicates."""
195+ yielded = set ()
196+ try :
197+ for path in paths :
198+ raw_path = path ._raw_path
199+ if raw_path not in yielded :
200+ yield path
201+ yielded .add (raw_path )
202+ finally :
203+ yielded .clear ()
265204
266205
267206#
@@ -1056,51 +995,109 @@ def _scandir(self):
1056995 return os .scandir (self )
1057996
1058997 def _make_child_relpath (self , name ):
998+ sep = self ._flavour .sep
999+ lines_name = name .replace ('\n ' , sep )
1000+ lines_str = self ._lines
10591001 path_str = str (self )
10601002 tail = self ._tail
10611003 if tail :
1062- path_str = f'{ path_str } { self ._flavour .sep } { name } '
1004+ path_str = f'{ path_str } { sep } { name } '
1005+ lines_str = f'{ lines_str } \n { lines_name } '
10631006 elif path_str != '.' :
10641007 path_str = f'{ path_str } { name } '
1008+ lines_str = f'{ lines_str } { lines_name } '
10651009 else :
10661010 path_str = name
1011+ lines_str = lines_name
10671012 path = self .with_segments (path_str )
10681013 path ._str = path_str
10691014 path ._drv = self .drive
10701015 path ._root = self .root
10711016 path ._tail_cached = tail + [name ]
1017+ path ._lines_cached = lines_str
10721018 return path
10731019
def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
    """Return an iterator over the existing files (of any kind, including
    directories) in this subtree that match the given relative pattern.

    A "pathlib.Path.glob" audit event is raised with this path and the
    pattern, after which the work is handed to _glob() unchanged.
    """
    sys.audit("pathlib.Path.glob", self, pattern)
    matches = self._glob(pattern, case_sensitive, follow_symlinks)
    return matches
10891026
def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
    """Return an iterator over the existing files (of any kind, including
    directories) matching the given relative pattern anywhere in this
    subtree.

    A "pathlib.Path.rglob" audit event is raised with this path and the
    original pattern; the pattern is then prefixed with '**/' and handed
    to _glob().
    """
    sys.audit("pathlib.Path.rglob", self, pattern)
    recursive_pattern = f'**/{pattern}'
    return self._glob(recursive_pattern, case_sensitive, follow_symlinks)
1034+
def _glob(self, pattern, case_sensitive, follow_symlinks):
    """Build and return the iterator of paths matching *pattern*.

    The pattern is parsed with with_segments() and processed one component
    at a time, each component wrapping the iterator produced so far.

    Raises NotImplementedError if the pattern has a drive or root, and
    ValueError if it has no components or contains '**' inside a larger
    component.  These checks, and the initial is_dir() test on this path,
    happen when the method is called, not when the result is iterated.
    """
    parsed = self.with_segments(pattern)
    if parsed.drive or parsed.root:
        raise NotImplementedError("Non-relative patterns are unsupported")
    if not parsed._tail:
        raise ValueError("Unacceptable pattern: {!r}".format(pattern))

    parts = list(parsed._tail)
    flavour = self._flavour
    if pattern[-1] in (flavour.sep, flavour.altsep):
        # GH-65238: pathlib doesn't preserve trailing slash. Add it back.
        parts.append('')
    if parts[-1] == '**':
        # GH-70303: '**' only matches directories. Add trailing slash.
        parts.append('')
    total = len(parts)

    if case_sensitive is None:
        # TODO: evaluate case-sensitivity of each directory in _select_children().
        case_sensitive = _is_case_sensitive(self._flavour)

    # 'Walk-and-match' strategy: when symlink handling was given explicitly
    # and the pattern has no '..' components, everything after a '**'
    # wildcard is turned into a single compiled pattern that filters one
    # recursive walk, so those later components need no filesystem access
    # of their own.
    walk_and_match = follow_symlinks is not None and '..' not in parts
    seen_recursive = False
    paths = iter([self] if self.is_dir() else [])
    idx = 0
    while idx < total:
        part = parts[idx]
        idx += 1
        if part == '':
            # Trailing slash: nothing to select.
            continue
        if part == '..':
            paths = (path._make_child_relpath('..') for path in paths)
            continue
        if part == '**':
            # Collapse a run of adjacent '**' components into one.
            while idx < total and parts[idx] == '**':
                idx += 1
            has_successor = idx < total

            if walk_and_match and has_successor and parts[idx] != '':
                dir_only = parts[-1] == ''
                paths = _select_recursive(paths, dir_only, follow_symlinks)

                # Keep only walked paths whose line form, with this path's
                # own prefix sliced off, matches the whole pattern.
                prefix_len = len(self._make_child_relpath('_')._lines) - 1
                match = _compile_pattern_lines(parsed._lines, case_sensitive).match
                return (path for path in paths if match(path._lines[prefix_len:]))

            paths = _select_recursive(paths, has_successor, follow_symlinks)
            if seen_recursive:
                # A second '**' can reach the same path twice: de-duplicate.
                paths = _select_unique(paths)
            seen_recursive = True
            continue
        if '**' in part:
            raise ValueError("Invalid pattern: '**' can only be an entire path component")
        # Ordinary component: match names among the direct children,
        # keeping only directories when more components follow.
        match = _compile_pattern(part, case_sensitive)
        paths = _select_children(paths, idx < total, follow_symlinks, match)
    return paths
11041101
11051102 def walk (self , top_down = True , on_error = None , follow_symlinks = False ):
11061103 """Walk the directory tree from this directory, similar to os.walk()."""
0 commit comments