# Source code for llvm_ir_dataset_utils.tools.get_julia_packages
"""Tool for getting Julia packages."""
import glob
import subprocess
import tempfile
import os
import logging
import json
import sys
from llvm_ir_dataset_utils.util import licenses
from absl import app
from absl import flags
import toml
import ray
# Shorthand for absl's global flag container; main() reads the values below
# from it.
FLAGS = flags.FLAGS

# Output location of the JSON package list written at the end of main().
flags.DEFINE_string(
    name='package_list',
    default='julia_package_list.json',
    help='The path to write all the list of Julia packages to.')

# Token handed to licenses.get_repository_licenses in main().
flags.DEFINE_string(
    name='gh_pat',
    default=None,
    help=
    'Your Github personal access token. Needed to query license information.')

# Opt-in second pass that runs license detection on repositories whose
# license is still 'NOASSERTION' after the Github lookup.
flags.DEFINE_boolean(
    name='source_ld',
    default=False,
    help='Whether or not to download the repositories that have not already been '
    'tagged with license information and use go-license-detector to detect '
    'license information')

# Upper bound on how many packages are collected from the registry.
flags.DEFINE_integer(
    name='max_projects',
    default=sys.maxsize,
    help='The max number of projects to process')

# The tool cannot run without a token, so fail at flag-parsing time.
flags.mark_flag_as_required('gh_pat')

# Git URL of the registry that is cloned and scanned for Package.toml files.
REGISTRY_REPOSITORY = 'https://github.com/JuliaRegistries/General'
@ray.remote(num_cpus=1)
def get_detected_license_repo_future(repo_url, repo_name):
  """Ray task that runs license detection for a single repository.

  Args:
    repo_url: Repository URL, forwarded unchanged to
      licenses.get_detected_license_from_repo.
    repo_name: Repository name, forwarded unchanged and echoed back in the
      result so the caller can match the result to its package.

  Returns:
    A (repo_name, detected_license) tuple, where detected_license is whatever
    licenses.get_detected_license_from_repo returns.
  """
  detected_license = licenses.get_detected_license_from_repo(
      repo_url, repo_name)
  return (repo_name, detected_license)
def _strip_git_suffix(repo_url):
  """Returns repo_url without a trailing '.git', if it has one."""
  # Only strip when the suffix is really there; blindly dropping the last
  # four characters would corrupt URLs that are listed without '.git'.
  if repo_url.endswith('.git'):
    return repo_url[:-len('.git')]
  return repo_url


def _collect_registry_packages(max_projects):
  """Clones the registry and lists the packages described in it.

  Args:
    max_projects: Scanning stops once at least this many packages have been
      collected.

  Returns:
    A list of {'name': ..., 'repo': ...} dicts, one per package kept, with
    both values taken verbatim from the package's Package.toml.

  Raises:
    subprocess.CalledProcessError: If the git clone fails.
  """
  package_list = []
  with tempfile.TemporaryDirectory() as download_dir:
    registry_path = os.path.join(download_dir, 'registry')
    logging.info('Cloning registry repository.')
    subprocess.run(
        ['git', 'clone', REGISTRY_REPOSITORY, '--depth=1', registry_path],
        check=True,
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE)
    logging.info('Processing registry.')
    package_toml_pattern = os.path.join(registry_path, '**/Package.toml')
    for package_toml_path in glob.glob(package_toml_pattern, recursive=True):
      # TOML documents are UTF-8; do not depend on the locale's encoding.
      with open(package_toml_path, encoding='utf-8') as package_toml_file:
        package_description = toml.load(package_toml_file)
      package_name = package_description['name']
      package_repo = package_description['repo']
      # NOTE(review): this is a substring test, so it also drops any package
      # whose name merely contains 'jll' -- confirm that this is intended.
      if 'jll' not in package_name:
        package_list.append({'name': package_name, 'repo': package_repo})
      if len(package_list) >= max_projects:
        break
  return package_list


def main(_):
  """Builds the Julia package list, annotates licenses and writes it as JSON.

  Steps:
    1. Clone the registry and collect up to --max_projects packages.
    2. Look up each package's license with licenses.get_repository_licenses,
       using --gh_pat.
    3. If --source_ld is set, run license detection through ray for packages
       whose license is still 'NOASSERTION'.
    4. Dump the resulting list of dicts to --package_list.

  Each output entry has the keys 'name', 'repo', 'license' and
  'license_source' ('github', 'go_license_detector' or None).

  Args:
    _: Unused positional arguments passed in by absl's app.run.
  """
  package_list = _collect_registry_packages(FLAGS.max_projects)

  # The license lookup is keyed by the repository URL without the '.git'
  # suffix that registry entries usually carry.
  repository_url_list = [
      _strip_git_suffix(package_dict['repo']) for package_dict in package_list
  ]

  logging.info('Gathering license information from the Github API.')
  repo_license_map = licenses.get_repository_licenses(repository_url_list,
                                                      FLAGS.gh_pat)
  for package_dict in package_list:
    # Must use the same normalization as repository_url_list above so the
    # key matches the URL that was queried.
    package_dict['license'] = repo_license_map[_strip_git_suffix(
        package_dict['repo'])]
    if package_dict['license'] != 'NOASSERTION':
      package_dict['license_source'] = 'github'
    else:
      package_dict['license_source'] = None

  if FLAGS.source_ld:
    logging.info('Gathering license information through license detection')
    ray.init()
    # One remote task per package that still has no usable license.
    repo_license_futures = [
        get_detected_license_repo_future.remote(package_dict['repo'],
                                                package_dict['name'])
        for package_dict in package_list
        if package_dict['license'] == 'NOASSERTION'
    ]

    detected_repo_name_license_map = {}
    # ray.wait splits the futures into ready and still-pending ones; keep
    # polling until nothing is pending.
    while repo_license_futures:
      finished, repo_license_futures = ray.wait(
          repo_license_futures, timeout=5.0)
      logging.info('Just got license information on %d repos, %d remaining.',
                   len(finished), len(repo_license_futures))
      for repo_name, repo_license in ray.get(finished):
        detected_repo_name_license_map[repo_name] = repo_license

    for package_dict in package_list:
      if package_dict['name'] in detected_repo_name_license_map:
        package_dict['license'] = detected_repo_name_license_map[
            package_dict['name']]
        # Detection may also come back with 'NOASSERTION'; in that case the
        # source stays None.
        if package_dict['license'] != 'NOASSERTION':
          package_dict['license_source'] = 'go_license_detector'

  logging.info('Writing packages to list.')
  with open(FLAGS.package_list, 'w') as package_list_file:
    json.dump(package_list, package_list_file, indent=2)
# Script entry point: hand main() to absl's app.run when this file is executed
# directly rather than imported.
if __name__ == '__main__':
  app.run(main)