mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
Recent package updates on PyPI have introduced package description
pages that have extra newlines in addition to the newline character
separating the complete URLs for the different package versions.
These extra newlines usually show up before the closing angle bracket
character ('>') of the opening half of the anchor tag.
This broke pip_download.py, because it uses a regex to crack out
various data items (file name, download path, hash algorithm and hash
value) from the download page. The regex attempts to match the whole anchor
element up to and including the closing '</a>' tag, which fails because
the '.' in a regex matches any character, except a newline. This failure
causes all lines in the package descriptor page to be rejected as not
matching the search pattern, so the package with a page in this format
can never be recognized.
This patch works around this formatting issue by adding the flag
re.DOTALL to the regex search call, making the regex '.' character match
the newline as well, so that the regex can match the complete anchor
element even when it spans a line break.
Change-Id: Ia56f87c54e0d9cad97b7e0ffbcce8f4c0f715c44
Reviewed-on: http://gerrit.cloudera.org:8080/23026
Reviewed-by: Joe McDonnell <joemcdonnell@cloudera.com>
Reviewed-by: Michael Smith <michael.smith@cloudera.com>
Tested-by: Joe McDonnell <joemcdonnell@cloudera.com>
185 lines
7.0 KiB
Python
Executable File
185 lines
7.0 KiB
Python
Executable File
#!/usr/bin/python3
|
|
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
# Implement the basic 'pip download' functionality in a way that gives us more control
|
|
# over which archive type is downloaded and what post-download steps are executed.
|
|
# This script requires Python 3.
|
|
|
|
import hashlib
|
|
import multiprocessing.pool
|
|
import os
|
|
import os.path
|
|
import re
|
|
import sys
|
|
from random import randint
|
|
from threading import Event
|
|
from time import sleep
|
|
import signal
|
|
import subprocess
|
|
|
|
# Upper bound on how many times the retry decorator attempts a single download.
NUM_DOWNLOAD_ATTEMPTS = 8

# Base URL of the package index. The PYPI_MIRROR environment variable overrides it.
PYPI_MIRROR = os.getenv('PYPI_MIRROR', 'https://pypi.python.org')

# The requirement files that list all of the required packages and versions.
REQUIREMENTS_FILES = [
    'requirements.txt',
    'setuptools-requirements.txt',
    'kudu-requirements.txt',
    'adls-requirements.txt',
    'py2-requirements.txt',
    'py3-requirements.txt',
    'gcovr-requirements.txt',
]
|
|
|
|
|
|
def check_digest(filename, algorithm, expected_digest):
    '''Returns True if the digest of the file's contents equals expected_digest.

    filename is the path of the file to hash, algorithm is a hashlib algorithm name
    (for example 'sha256') and expected_digest is the expected hex digest string.
    Returns False, after printing a message, if hashlib does not support the
    algorithm. Errors raised while opening or reading the file propagate to the
    caller.
    '''
    try:
        supported_algorithms = hashlib.algorithms_available
    except AttributeError:
        # Fallback to hardcoded set if hashlib.algorithms_available doesn't exist.
        supported_algorithms = set(['md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'])
    if algorithm not in supported_algorithms:
        print('Hash algorithm {0} is not supported by hashlib'.format(algorithm))
        return False
    h = hashlib.new(algorithm)
    # Hash the file in fixed-size chunks so a large archive is never held in memory
    # all at once, and use a context manager so the file handle is always closed.
    with open(filename, mode='rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            h.update(chunk)
    return h.hexdigest() == expected_digest
|
|
|
|
|
|
def retry(func):
    '''Retry decorator.

    The returned wrapper calls func up to NUM_DOWNLOAD_ATTEMPTS times and returns
    the first truthy result. Before every attempt after the first it sleeps for a
    random, exponentially growing number of seconds. Exceptions raised by func are
    printed and count as a failed attempt.

    If the caller passes an 'is_canceled' keyword argument holding an event-like
    object, the wrapper returns False as soon as that event is set. The keyword is
    optional. If every attempt fails, the wrapper prints a message and calls
    sys.exit(1).
    '''

    def wrapper(*args, **kwargs):
        # Use .get() because callers are allowed to omit 'is_canceled' entirely;
        # indexing kwargs directly would raise KeyError for such calls.
        is_canceled = kwargs.get('is_canceled')
        for try_num in range(NUM_DOWNLOAD_ATTEMPTS):
            if is_canceled and is_canceled.is_set():
                print('Cancelling {} download after {} tries.'.format(args, try_num))
                return False
            elif try_num > 0:
                # Randomized exponential backoff between attempts.
                sleep_len = randint(5, 10 * 2 ** try_num)
                print('Sleeping for {0} seconds before retrying'.format(sleep_len))
                sleep(sleep_len)
            try:
                result = func(*args, **kwargs)
                if result:
                    return result
            except Exception as e:
                # Best effort: report the error and fall through to the next attempt.
                print(e)
        print('Download failed after several attempts.')
        sys.exit(1)

    return wrapper
|
|
|
|
|
|
def get_package_info(pkg_name, pkg_version, is_canceled=None):
    '''Looks up the source archive of the given package version on the PyPI mirror.

    Returns a (file name, path, hash algorithm, digest) tuple. All four entries are
    None if is_canceled is set or if the index page lists no matching archive.
    '''
    not_found = (None, None, None, None)
    if is_canceled and is_canceled.is_set():
        return not_found

    index_name = re.sub(r"[-_.]+", "-", pkg_name).lower()
    url = '{0}/simple/{1}/'.format(PYPI_MIRROR, index_name)
    print('Getting package info from {0}'.format(url))
    # The index page is expected to follow PEP 503
    # (https://www.python.org/dev/peps/pep-0503/). A regex is used instead of an HTML
    # parser so that no extra package has to be installed before this script can run.
    page = subprocess.check_output(
        ["wget", "-q", "-O", "-", url], universal_newlines=True)

    # Wheels are not supported, so only these source archive suffixes are accepted.
    archive_suffixes = tuple(
        suffix.format(pkg_version) for suffix in ('-{0}.tar.gz', '-{0}.tar.bz2', '-{0}.zip'))
    anchor_regex = r'<a .*?href=\".*?packages/(.*?)#(.*?)=(.*?)\".*?>(.*?)<\/a>'

    # Every match is collected and the alphabetically smallest one is returned, so the
    # result does not depend on the order in which the server lists the files.
    candidates = []
    # re.DOTALL lets '.' match newlines, so anchors that span line breaks still match.
    for anchor in re.finditer(anchor_regex, page, flags=re.DOTALL):
        path, hash_algorithm, digest, file_name = anchor.groups()
        if file_name.endswith(archive_suffixes):
            candidates.append((file_name, path, hash_algorithm, digest))

    if candidates:
        return min(candidates)
    print('Could not find archive to download for {0} {1}'.format(pkg_name, pkg_version))
    return not_found
|
|
|
|
|
|
@retry
def download_package(pkg_name, pkg_version, is_canceled=None):
    '''Downloads the source archive of a package version into the current directory.

    Returns True if a file with the expected digest already exists or was downloaded
    successfully. Returns False if no archive was found, the download was canceled or
    the digest of the downloaded file does not match. Exceptions (for example a
    failing wget call) are not handled here; the retry decorator deals with them.
    '''
    # Forward is_canceled so that the index lookup is skipped after cancellation.
    file_name, path, hash_algorithm, expected_digest = get_package_info(
        pkg_name, pkg_version, is_canceled=is_canceled)
    if not file_name:
        return False
    if os.path.isfile(file_name) and check_digest(file_name, hash_algorithm,
                                                  expected_digest):
        print('File with matching digest already exists, skipping {0}'.format(file_name))
        return True
    if is_canceled and is_canceled.is_set():
        return False
    pkg_url = '{0}/packages/{1}'.format(PYPI_MIRROR, path)
    print('Downloading {0} from {1}'.format(file_name, pkg_url))
    # check_call raises CalledProcessError on a non-zero exit status, so there is no
    # return code to inspect here.
    subprocess.check_call(["wget", pkg_url, "-q", "-O", file_name])
    if check_digest(file_name, hash_algorithm, expected_digest):
        return True
    print('Hash digest check failed in file {0}.'.format(file_name))
    return False
|
|
|
|
|
|
def main():
    '''Entry point of the script.

    With command line arguments (a package name and a version), downloads that
    single package. Without arguments, downloads every package pinned in
    REQUIREMENTS_FILES using a small thread pool. Ctrl+C sets a shared event that
    makes pending downloads stop.
    '''
    if len(sys.argv) > 1:
        _, pkg_name, pkg_version = sys.argv
        # Pass is_canceled explicitly because the retry wrapper around
        # download_package looks it up among the keyword arguments.
        download_package(pkg_name, pkg_version, is_canceled=None)
        return

    pool = multiprocessing.pool.ThreadPool(processes=min(multiprocessing.cpu_count(), 4))
    results = []
    is_canceled = Event()

    def handler(signum, frame):
        print('Ctrl+C was pressed.')
        is_canceled.set()

    signal.signal(signal.SIGINT, handler)

    for requirements_file in REQUIREMENTS_FILES:
        # If the package name and version are not specified in the command line
        # arguments, download the packages that are listed in the requirements files.
        # The requirements files follow the standard pip grammar.
        with open(requirements_file) as f:
            for line in f:
                # A hash symbol ("#") represents a comment that should be ignored.
                line = line.split("#")[0]
                # A semi colon (";") specifies some additional condition for when the
                # package should be installed (for example a specific OS). We can
                # ignore this and download the package anyways because the
                # installation script (bootstrap_virtualenv.py) can take it into
                # account.
                first_token = line.split(";")[0].strip()
                if not first_token:
                    continue
                pkg_name, pkg_version = first_token.split('==')
                results.append(pool.apply_async(
                    download_package, args=[pkg_name.strip(), pkg_version.strip()],
                    kwds={'is_canceled': is_canceled}))

    # Wait for every download; get() re-raises an exception raised by the task.
    for x in results:
        x.get()
    # All tasks have completed at this point, so shut the worker threads down.
    pool.close()
    pool.join()
|
|
|
|
|
|
# Run the downloader only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
|