Speed-up version parsing and comparison (#22973)

The VALID_VERSION regex didn't check that the version string was
completely valid, only that a prefix of it was. This version ensures
the entire string represents a valid version.

This makes a few related changes.

1. Make the SEGMENT_REGEX identify *which* arm it matches by what groups
   are populated, including whether it's a string or int component or a
   separator all at once.

2. Use the updated regex to parse the input once with a findall rather
   than twice, once with findall and once with split, since the version
   components and separators can be distinguished by their group status.

3. Rather than "convert to int, on exception stay string," if the int
   group is set then convert to int, if not then construct an instance
   of the VersionStrComponent class

4. VersionStrComponent now implements all of the special string
   comparison logic as part of its __lt__ and __eq__ methods to deal
   with infinity versions and also overloads comparison with integers.

5. Version now uses direct tuple comparison since it has no per-element
   special logic outside the VersionStrComponent class.

Co-authored-by: Massimiliano Culpo <massimiliano.culpo@gmail.com>
This commit is contained in:
Tom Scogland 2021-06-04 00:23:37 -07:00 committed by GitHub
parent ea390198f4
commit b63a8b3e27
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 84 additions and 52 deletions

View file

@ -565,3 +565,14 @@ def test_list_highest():
assert vl2.highest_numeric() is None assert vl2.highest_numeric() is None
assert vl2.preferred() == Version('develop') assert vl2.preferred() == Version('develop')
assert vl2.lowest() == Version('master') assert vl2.lowest() == Version('master')
@pytest.mark.parametrize('version_str', [
"foo 1.2.0",
"!",
"1!2"
])
def test_invalid_versions(version_str):
"""Ensure invalid versions are rejected with a ValueError"""
with pytest.raises(ValueError):
Version(version_str)

View file

@ -37,21 +37,15 @@
__all__ = ['Version', 'VersionRange', 'VersionList', 'ver'] __all__ = ['Version', 'VersionRange', 'VersionList', 'ver']
# Valid version characters # Valid version characters
VALID_VERSION = re.compile(r'[A-Za-z0-9_.-]') VALID_VERSION = re.compile(r'^[A-Za-z0-9_.-]+$')
# regex for version segments # regex for version segments
SEGMENT_REGEX = re.compile(r'[a-zA-Z]+|[0-9]+') SEGMENT_REGEX = re.compile(r'(?:(?P<num>[0-9]+)|(?P<str>[a-zA-Z]+))(?P<sep>[_.-]*)')
# Infinity-like versions. The order in the list implies the comparison rules # Infinity-like versions. The order in the list implies the comparison rules
infinity_versions = ['develop', 'main', 'master', 'head', 'trunk'] infinity_versions = ['develop', 'main', 'master', 'head', 'trunk']
iv_min_len = min(len(s) for s in infinity_versions)
def int_if_int(string):
"""Convert a string to int if possible. Otherwise, return a string."""
try:
return int(string)
except ValueError:
return string
def coerce_versions(a, b): def coerce_versions(a, b):
@ -96,26 +90,86 @@ def coercing_method(a, b, *args, **kwargs):
return coercing_method return coercing_method
class VersionStrComponent(object):
# NOTE: this is intentionally not a UserString, the abc instanceof
# check is slow enough to eliminate all gains
__slots__ = ['inf_ver', 'data']
def __init__(self, string):
self.inf_ver = None
self.data = string
if len(string) >= iv_min_len:
try:
self.inf_ver = infinity_versions.index(string)
except ValueError:
pass
def __hash__(self):
return hash(self.data)
def __str__(self):
return self.data
def __eq__(self, other):
if isinstance(other, VersionStrComponent):
return self.data == other.data
return self.data == other
def __lt__(self, other):
if isinstance(other, VersionStrComponent):
if self.inf_ver is not None:
if other.inf_ver is not None:
return self.inf_ver > other.inf_ver
return False
if other.inf_ver is not None:
return True
return self.data < other.data
if self.inf_ver is not None:
return False
# Numbers are always "newer" than letters.
# This is for consistency with RPM. See patch
# #60884 (and details) from bugzilla #50977 in
# the RPM project at rpm.org. Or look at
# rpmvercmp.c if you want to see how this is
# implemented there.
if isinstance(other, int):
return True
if isinstance(other, str):
return self < VersionStrComponent(other)
# If we get here, it's an unsupported comparison
raise ValueError("VersionStrComponent can only be compared with itself, "
"int and str")
def __gt__(self, other):
return not self.__lt__(other)
class Version(object): class Version(object):
"""Class to represent versions""" """Class to represent versions"""
__slots__ = ['version', 'separators', 'string']
def __init__(self, string): def __init__(self, string):
if not isinstance(string, str): if not isinstance(string, str):
string = str(string) string = str(string)
if not VALID_VERSION.match(string):
raise ValueError("Bad characters in version string: %s" % string)
# preserve the original string, but trimmed. # preserve the original string, but trimmed.
string = string.strip() string = string.strip()
self.string = string self.string = string
# Split version into alphabetical and numeric segments if not VALID_VERSION.match(string):
segments = SEGMENT_REGEX.findall(string) raise ValueError("Bad characters in version string: %s" % string)
self.version = tuple(int_if_int(seg) for seg in segments)
# Store the separators from the original version string as well. # Split version into alphabetical and numeric segments simultaneously
self.separators = tuple(SEGMENT_REGEX.split(string)[1:]) segments = SEGMENT_REGEX.findall(string)
self.version = tuple(
int(m[0]) if m[0] else VersionStrComponent(m[1]) for m in segments
)
self.separators = tuple(m[2] for m in segments)
@property @property
def dotted(self): def dotted(self):
@ -277,40 +331,8 @@ def __lt__(self, other):
if other is None: if other is None:
return False return False
# Coerce if other is not a Version # Use tuple comparison assisted by VersionStrComponent for performance
# simple equality test first. return self.version < other.version
if self.version == other.version:
return False
# Standard comparison of two numeric versions
for a, b in zip(self.version, other.version):
if a == b:
continue
else:
if a in infinity_versions:
if b in infinity_versions:
return (infinity_versions.index(a) >
infinity_versions.index(b))
else:
return False
if b in infinity_versions:
return True
# Neither a nor b is infinity
# Numbers are always "newer" than letters.
# This is for consistency with RPM. See patch
# #60884 (and details) from bugzilla #50977 in
# the RPM project at rpm.org. Or look at
# rpmvercmp.c if you want to see how this is
# implemented there.
if type(a) != type(b):
return type(b) == int
else:
return a < b
# If the common prefix is equal, the one
# with more segments is bigger.
return len(self.version) < len(other.version)
@coerced @coerced
def __eq__(self, other): def __eq__(self, other):
@ -595,7 +617,6 @@ def __init__(self, vlist=None):
else: else:
self.versions = [vlist] self.versions = [vlist]
else: else:
vlist = list(vlist)
for v in vlist: for v in vlist:
self.add(ver(v)) self.add(ver(v))