Source code for llvm_ir_dataset_utils.tools.get_swift_packages
"""Tool for getting Swift package list."""
import subprocess
import tempfile
import logging
import json
import os
import sys
from llvm_ir_dataset_utils.util import licenses
from absl import app
from absl import flags
import ray
FLAGS = flags.FLAGS
flags.DEFINE_string('package_list', 'swift_package_list.txt',
'The path to write the list of swift packages to.')
flags.DEFINE_string(
'gh_pat', None,
'Your github personal access token. Needed to query license information')
flags.DEFINE_boolean(
'source_ld', False,
'Whether or not to download the repositories that have not already been '
'tagged with license information and use go-license-detector to detect '
'license information')
flags.DEFINE_integer('max_projects', sys.maxsize,
'The maximum number of projects to process.')
flags.mark_flag_as_required('gh_pat')
REGISTRY_REPOSITORY = 'https://github.com/SwiftPackageIndex/PackageList'
# TODO(boomanaiden154): This and some of the code below can be refactored
# out into some common utilities as quite a bit is duplicated with
# get_julia_packages.py
@ray.remote(num_cpus=1)
def get_detected_license_repo_future(repo_url, repo_name):
return (repo_name,
licenses.get_detected_license_from_repo(repo_url, repo_name))
[docs]def main(_):
package_list = []
with tempfile.TemporaryDirectory() as download_dir:
registry_path = os.path.join(download_dir, 'registry')
registry_clone_vector = [
'git', 'clone', REGISTRY_REPOSITORY, '--depth=1', registry_path
]
logging.info('Cloning registry repository.')
subprocess.run(
registry_clone_vector,
check=True,
stderr=subprocess.PIPE,
stdout=subprocess.PIPE)
logging.info('Processing registry.')
package_list_json_path = os.path.join(registry_path, 'packages.json')
with open(package_list_json_path) as package_list_json_file:
package_list = json.load(package_list_json_file)
package_list = package_list[:FLAGS.max_projects]
logging.info('Collecting license information from the Github API.')
sanitized_package_list = []
for package in package_list:
# We don't want the .git that is automatically at the end
sanitized_package_list.append(package[:-4])
repository_license_map = licenses.get_repository_licenses(
sanitized_package_list, FLAGS.gh_pat)
logging.info('Writing packages to list.')
output_package_list = []
for package in package_list:
current_package = {
'repo': package,
'name': package.split('/')[-1][:-4],
'license': repository_license_map[package[:-4]]
}
if repository_license_map[package[:-4]] != 'NOASSERTION':
current_package['license_source'] = 'github'
else:
current_package['license_source'] = None
output_package_list.append(current_package)
if FLAGS.source_ld:
logging.info('Gathering license information through license detection')
ray.init()
repo_license_futures = []
for package_dict in output_package_list:
if package_dict['license'] == 'NOASSERTION':
repo_license_futures.append(
get_detected_license_repo_future.remote(package_dict['repo'],
package_dict['name']))
detected_repo_name_license_map = {}
while len(repo_license_futures) > 0:
finished, repo_license_futures = ray.wait(
repo_license_futures, timeout=5.0)
logging.info(
f'Just got license information in {len(finished)} repos, {len(repo_license_futures)} remaining.'
)
repo_names_licenses = ray.get(finished)
for repo_name, repo_license in repo_names_licenses:
detected_repo_name_license_map[repo_name] = repo_license
for package_dict in output_package_list:
if package_dict['name'] in detected_repo_name_license_map:
package_dict['license'] = detected_repo_name_license_map[
package_dict['name']]
if package_dict['license'] != 'NOASSERTION':
package_dict['license_source'] = 'go_license_detector'
with open(FLAGS.package_list, 'w') as package_list_file:
json.dump(output_package_list, package_list_file, indent=2)
if __name__ == '__main__':
app.run(main)