dask · phofl · Feb 29, 2024 · Feb 29, 2024 · Feb 29, 2024 · Feb 29, 2024
diff --git a/dask_expr/_expr.py b/dask_expr/_expr.py
@@ -2867,7 +2867,7 @@ def are_co_aligned(*exprs, allow_broadcast=True):
     ancestors = [set(non_blockwise_ancestors(e)) for e in exprs]
     unique_ancestors = {
         # Account for column projection within IO expressions
-        _tokenize_partial(item, ["columns", "_series"])
+        _tokenize_partial(item, ["columns", "_series", "_dataset_info_cache"])
         for item in flatten(ancestors, container=set)
     }
     # Don't check divisions or npartitions at all

diff --git a/dask_expr/io/parquet.py b/dask_expr/io/parquet.py
@@ -617,7 +617,7 @@ class ReadParquetPyarrowFS(ReadParquet):
 
     @cached_property
     def normalized_path(self):
-        return pa_fs.FileSystem.from_uri(self.path)[1]
+        return _normalize_and_strip_protocol(self.path)
 
     @cached_property
     def fs(self):
@@ -639,21 +639,24 @@ def _dataset_info(self):
         dataset_info = {}
 
         path_normalized = self.normalized_path
-        # At this point we will post a couple of listbucket operations which
-        # includes the same data as a HEAD request.
-        # The information included here (see pyarrow FileInfo) are size, type,
-        # path and modified since timestamps
-        # This isn't free but realtively cheap (200-300ms or less for ~1k files)
-        finfo = self.fs.get_file_info(path_normalized)
-        if finfo.type == pa.fs.FileType.Directory:
+        # We'll first treat the path as if it was a directory since this is the
+        # most common case. Only if this fails, we'll treat it as a file. This
+        # way, the happy path performs one remote request instead of two if we
+        # were to check the type of the path first.
+        try:
+            # At this point we will post a listbucket request which includes the
+            # same data as a HEAD request. The information included here (see
+            # pyarrow FileInfo) are size, type, path and modified since
+            # timestamps This isn't free but realtively cheap (200-300ms or less
+            # for ~1k files)
             dataset_selector = pa_fs.FileSelector(path_normalized, recursive=True)
             all_files = [
                 finfo
                 for finfo in self.fs.get_file_info(dataset_selector)
                 if finfo.type == pa.fs.FileType.File
             ]
-        else:
-            all_files = [finfo]
+        except NotADirectoryError:
+            all_files = [self.fs.get_file_info(path_normalized)]
         # TODO: At this point we could verify if we're dealing with a very
         # inhomogeneous datasets already without reading any further data
 
@@ -1321,3 +1324,14 @@ def _read_partition_stats(part, fs, columns=None):
 
     # Helper function used by _extract_statistics
     return [_read_partition_stats(part, fs, columns=columns) for part in parts]
+
+
+def _normalize_and_strip_protocol(path):
+    protocol_separators = ["://", "::"]
+    for sep in protocol_separators:
+        split = path.split(sep, 1)
+        if len(split) > 1:
+            path = split[1]
+            break
+    path = path.rstrip("/")
+    return path