Skip to content

Commit

Permalink
Smarter (and looser) link equivalency logic
Browse files Browse the repository at this point in the history
  • Loading branch information
uranusjr committed Jul 1, 2021
1 parent 4704da4 commit 48fd6a9
Show file tree
Hide file tree
Showing 5 changed files with 210 additions and 15 deletions.
7 changes: 7 additions & 0 deletions news/10002.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
New resolver: Loosen URL comparison logic when checking for direct URL reference
equivalency. The logic includes the following notable characteristics:

* The authentication part of the URL is explicitly ignored.
* Most of the fragment part, including ``egg=``, is explicitly ignored. Only
``subdirectory=`` is kept.
* The query part of the URL is hashed to allow ordering differences.
60 changes: 57 additions & 3 deletions src/pip/_internal/models/link.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import functools
import logging
import os
import posixpath
import re
import urllib.parse
from typing import TYPE_CHECKING, Optional, Tuple, Union
from typing import TYPE_CHECKING, Dict, List, NamedTuple, Optional, Tuple, Union

from pip._internal.utils.filetypes import WHEEL_EXTENSION
from pip._internal.utils.hashes import Hashes
Expand All @@ -17,6 +19,8 @@
if TYPE_CHECKING:
from pip._internal.index.collector import HTMLPage

logger = logging.getLogger(__name__)


class Link(KeyBasedCompareMixin):
"""Represents a parsed link from a Package Index's simple URL
Expand Down Expand Up @@ -242,7 +246,57 @@ def is_hash_allowed(self, hashes):
return hashes.is_hash_allowed(self.hash_name, hex_digest=self.hash)


# TODO: Relax this comparison logic to ignore, for example, fragments.
class _CleanResult(NamedTuple):
"""Convert link for equivalency check.
This is used in the resolver to check whether two URL-specified requirements
likely point to the same distribution and can be considered equivalent. This
equivalency logic avoids comparing URLs literally, which can be too strict
(e.g. "a=1&b=2" vs "b=2&a=1") and produce conflicts unexpecting to users.
Currently this does three things:
1. Drop the basic auth part. This is technically wrong since a server can
serve different content based on auth, but if it does that, it is even
impossible to guarantee two URLs without auth are equivalent, since
the user can input different auth information when prompted. So the
practical solution is to assume the auth doesn't affect the response.
2. Parse the query to avoid the ordering issue. Note that ordering under the
same key in the query are NOT cleaned; i.e. "a=1&a=2" and "a=2&a=1" are
still considered different.
3. Explicitly drop most of the fragment part, except ``subdirectory=``,
since it should not affect the downloaded content. Note that this drops
the "egg=" part historically used to denote the requested project (and
extras), which is wrong in the strictest sense, but too many people are
supplying it inconsistently to cause superfluous resolution conflicts, so
we choose to also ignore them.
"""

parsed: urllib.parse.SplitResult
query: Dict[str, List[str]]
subdirectory: str

@classmethod
def from_link(cls, link: Link) -> "_CleanResult":
parsed = link._parsed_url
netloc = parsed.netloc.rsplit("@", 1)[-1]
fragment = urllib.parse.parse_qs(parsed.fragment)
if "egg" in fragment:
logger.debug("Ignoring egg= fragment in %s", link)
try:
# If there are multiple subdirectory values, use the first one.
# This matches the behaviour of Link.subdirectory_fragment.
subdirectory = fragment["subdirectory"][0]
except (IndexError, KeyError):
subdirectory = ""
return cls(
parsed=parsed._replace(netloc=netloc, query="", fragment=""),
query=urllib.parse.parse_qs(parsed.query),
subdirectory=subdirectory,
)


@functools.lru_cache(maxsize=None)
def links_equivalent(link1, link2):
# type: (Link, Link) -> bool
return link1 == link2
return _CleanResult.from_link(link1) == _CleanResult.from_link(link2)
13 changes: 2 additions & 11 deletions tests/functional/test_install_reqs.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,25 +423,16 @@ def test_constraints_constrain_to_local(script, data, resolver_variant):
assert 'Running setup.py install for singlemodule' in result.stdout


def test_constrained_to_url_install_same_url(script, data, resolver_variant):
def test_constrained_to_url_install_same_url(script, data):
to_install = data.src.joinpath("singlemodule")
constraints = path_to_url(to_install) + "#egg=singlemodule"
script.scratch_path.joinpath("constraints.txt").write_text(constraints)
result = script.pip(
'install', '--no-index', '-f', data.find_links, '-c',
script.scratch_path / 'constraints.txt', to_install,
allow_stderr_warning=True,
expect_error=(resolver_variant == "2020-resolver"),
)
if resolver_variant == "2020-resolver":
assert 'Cannot install singlemodule 0.0.1' in result.stderr, str(result)
assert (
'because these package versions have conflicting dependencies.'
in result.stderr
), str(result)
else:
assert ('Running setup.py install for singlemodule'
in result.stdout), str(result)
assert 'Running setup.py install for singlemodule' in result.stdout, str(result)


def test_double_install_spurious_hash_mismatch(
Expand Down
90 changes: 90 additions & 0 deletions tests/functional/test_new_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -1791,6 +1791,96 @@ def test_new_resolver_avoids_incompatible_wheel_tags_in_constraint_url(
assert_not_installed(script, "dep")


@pytest.mark.parametrize(
"suffixes_equivalent, depend_suffix, request_suffix",
[
pytest.param(
True,
"#egg=foo",
"",
id="drop-depend-egg",
),
pytest.param(
True,
"",
"#egg=foo",
id="drop-request-egg",
),
pytest.param(
True,
"#subdirectory=bar&egg=foo",
"#subdirectory=bar&egg=bar",
id="drop-egg-only",
),
pytest.param(
True,
"#subdirectory=bar&egg=foo",
"#egg=foo&subdirectory=bar",
id="fragment-ordering",
),
pytest.param(
True,
"?a=1&b=2",
"?b=2&a=1",
id="query-opordering",
),
pytest.param(
False,
"#sha512=1234567890abcdef",
"#sha512=abcdef1234567890",
id="different-keys",
),
pytest.param(
False,
"#sha512=1234567890abcdef",
"#md5=1234567890abcdef",
id="different-values",
),
pytest.param(
False,
"#subdirectory=bar&egg=foo",
"#subdirectory=rex",
id="drop-egg-still-different",
),
],
)
def test_new_resolver_direct_url_equivalent(
tmp_path,
script,
suffixes_equivalent,
depend_suffix,
request_suffix,
):
pkga = create_basic_wheel_for_package(script, name="pkga", version="1")
pkgb = create_basic_wheel_for_package(
script,
name="pkgb",
version="1",
depends=[f"pkga@{path_to_url(pkga)}{depend_suffix}"],
)

# Make pkgb visible via --find-links, but not pkga.
find_links = tmp_path.joinpath("find_links")
find_links.mkdir()
with open(pkgb, "rb") as f:
find_links.joinpath(pkgb.name).write_bytes(f.read())

# Install pkgb from --find-links, and pkga directly but from a different
# URL suffix as specified in pkgb. This should work!
script.pip(
"install",
"--no-cache-dir", "--no-index",
"--find-links", str(find_links),
f"{path_to_url(pkga)}{request_suffix}", "pkgb",
expect_error=(not suffixes_equivalent),
)

if suffixes_equivalent:
assert_installed(script, pkga="1", pkgb="1")
else:
assert_not_installed(script, "pkga", "pkgb")


def test_new_resolver_direct_url_with_extras(tmp_path, script):
pkg1 = create_basic_wheel_for_package(script, name="pkg1", version="1")
pkg2 = create_basic_wheel_for_package(
Expand Down
55 changes: 54 additions & 1 deletion tests/unit/test_link.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest

from pip._internal.models.link import Link
from pip._internal.models.link import Link, links_equivalent
from pip._internal.utils.hashes import Hashes


Expand Down Expand Up @@ -138,3 +138,56 @@ def test_is_hash_allowed__none_hashes(self, hashes, expected):
def test_is_vcs(self, url, expected):
link = Link(url)
assert link.is_vcs is expected


@pytest.mark.parametrize(
"url1, url2",
[
pytest.param(
"https://example.com/foo#egg=foo",
"https://example.com/foo",
id="drop-egg",
),
pytest.param(
"https://example.com/foo#subdirectory=bar&egg=foo",
"https://example.com/foo#subdirectory=bar&egg=bar",
id="drop-egg-only",
),
pytest.param(
"https://example.com/foo#subdirectory=bar&egg=foo",
"https://example.com/foo#egg=foo&subdirectory=bar",
id="fragment-ordering",
),
pytest.param(
"https://example.com/foo?a=1&b=2",
"https://example.com/foo?b=2&a=1",
id="query-opordering",
),
],
)
def test_links_equivalent(url1, url2):
assert links_equivalent(Link(url1), Link(url2))


@pytest.mark.parametrize(
"url1, url2",
[
pytest.param(
"https://example.com/foo#sha512=1234567890abcdef",
"https://example.com/foo#sha512=abcdef1234567890",
id="different-keys",
),
pytest.param(
"https://example.com/foo#sha512=1234567890abcdef",
"https://example.com/foo#md5=1234567890abcdef",
id="different-values",
),
pytest.param(
"https://example.com/foo#subdirectory=bar&egg=foo",
"https://example.com/foo#subdirectory=rex",
id="drop-egg-still-different",
),
],
)
def test_links_equivalent_false(url1, url2):
assert not links_equivalent(Link(url1), Link(url2))

0 comments on commit 48fd6a9

Please sign in to comment.