# SPDX-License-Identifier: GPL-3.0+
from __future__ import unicode_literals
import json
import xml.etree.ElementTree as ET
import neomodel
import estuary.utils.general as utils
from estuary import log
from estuary.models.distgit import DistGitCommit
from estuary.models.koji import ContainerKojiBuild, KojiBuild, ModuleKojiBuild
from estuary.models.user import User
from scrapers.base import BaseScraper
class KojiScraper(BaseScraper):
    """
    Scrapes the Koji tables in Teiid.

    The Teiid client (``self.teiid``), the default date window (``self.default_since`` and
    ``self.default_until``) and the ``is_container_build`` / ``is_module_build`` predicates are
    not defined in this class; they are expected to be provided by ``BaseScraper``.
    """

    # SELECT and JOIN clauses shared by every query that returns Koji build rows, so that
    # get_koji_builds and get_build_info always yield dictionaries with the same keys.
    _BUILD_QUERY = """
        SELECT
            brew.events.time as creation_time,
            brew.build.completion_time,
            brew.build.epoch,
            brew.build.extra,
            brew.build.id,
            brew.users.name as owner_name,
            brew.package.name as package_name,
            brew.build.release,
            brew.build.start_time,
            brew.build.state,
            brew.build.task_id,
            brew.build.version
        FROM brew.build
        LEFT JOIN brew.events ON brew.build.create_event = brew.events.id
        LEFT JOIN brew.package ON brew.build.pkg_id = brew.package.id
        LEFT JOIN brew.users ON brew.build.owner = brew.users.id
    """

    def run(self, since=None, until=None):
        """
        Run the Koji scraper.

        :param str since: a datetime to start scraping data from
        :param str until: a datetime to scrape data until
        """
        log.info('Starting initial load for Koji')
        # Fall back to the scraper's default window when the caller gives no bounds
        if since is None:
            start_date = self.default_since
        else:
            start_date = utils.timestamp_to_date(since)

        if until is None:
            end_date = self.default_until
        else:
            end_date = utils.timestamp_to_date(until)

        builds = self.get_koji_builds(start_date, end_date)
        log.info('Successfully fetched {0} builds from teiid'.format(len(builds)))
        self.update_neo4j(builds)
        log.info('Initial load of Koji builds complete!')

    def get_koji_builds(self, start_date, end_date):
        """
        Query Teiid for Koji builds.

        :param datetime.datetime start_date: determines when to start the query
        :param datetime.datetime end_date: determines until when to scrape data
        :return: a list of dictionaries
        :rtype: list
        """
        log.info('Getting all Koji builds since {0} until {1}'.format(start_date, end_date))
        # Fetch every build whose creation event falls between start_date and end_date
        sql_query = self._BUILD_QUERY + """
        WHERE brew.events.time IS NOT NULL AND brew.events.time >= '{0}'
            AND brew.events.time <= '{1}'
        ORDER BY brew.build.start_time DESC;
        """.format(start_date, end_date)
        return self.teiid.query(sql=sql_query)

    def get_task(self, task_id):
        """
        Query Teiid for a Koji task.

        :param int task_id: the Koji task ID to query
        :return: a list of dictionaries
        :rtype: list
        :raises ValueError: if ``task_id`` cannot be converted to an integer
        """
        # Coerce to int so that only a number can ever be interpolated into the SQL text
        sql_query = """
            SELECT arch, completion_time, create_time, id, "method", priority, request,
                start_time, state, weight
            FROM brew.task
            WHERE id = {0};
        """.format(int(task_id))
        return self.teiid.query(sql=sql_query)

    def get_tag_info(self, tag_name):
        """
        Query Teiid for tag_id of a tag and build_ids associated to it.

        :param str tag_name: tag name
        :return: a list of dictionaries
        :rtype: list
        """
        # Double any single quotes so the tag name cannot terminate the SQL string literal
        escaped_tag_name = tag_name.replace("'", "''")
        sql_query = """
            SELECT tags.id AS tag_id, tag_listing.build_id
            FROM brew.tag_listing AS tag_listing
            LEFT JOIN brew.tag AS tags ON tag_listing.tag_id = tags.id
            WHERE tag_listing.active = True AND tags.name = '{0}';
        """.format(escaped_tag_name)
        return self.teiid.query(sql=sql_query)

    def get_build_info(self, build_ids):
        """
        Query Teiid for build info.

        :param list build_ids: ID's of Koji builds
        :return: a list of dictionaries; empty when no IDs are given
        :rtype: list
        :raises ValueError: if one of the IDs cannot be converted to an integer
        """
        if not build_ids:
            return []

        # Coerce every ID to int so that only numbers are interpolated into the SQL text
        id_list = ', '.join(str(int(build_id)) for build_id in build_ids)
        sql_query = self._BUILD_QUERY + 'WHERE brew.build.id IN ({0})'.format(id_list)
        return self.teiid.query(sql=sql_query)

    def update_neo4j(self, builds):
        """
        Update Neo4j with Koji build information from Teiid.

        For every build this creates or updates the build node, connects its owner, and,
        when a task can be found, connects the dist-git commit the build was made from.
        Module builds additionally get their component builds connected and uploaded through
        a recursive call to this method.

        :param list builds: a list of dictionaries
        """
        log.info('Beginning to upload data to Neo4j')
        for count, build_dict in enumerate(builds, 1):
            build_params = dict(
                id_=build_dict['id'],
                epoch=build_dict['epoch'],
                state=build_dict['state'],
                creation_time=build_dict['creation_time'],
                start_time=build_dict['start_time'],
                completion_time=build_dict['completion_time'],
                name=build_dict['package_name'],
                version=build_dict['version'],
                release=build_dict['release']
            )
            extra_json = self._load_extra(build_dict['extra'])
            # A key that is present but null must not break the chained lookups below
            typeinfo = extra_json.get('typeinfo') or {}

            # Only set for module builds; reset on every iteration so that a value left
            # over from an earlier build can never be picked up
            module_info = None
            if self.is_container_build(build_dict):
                operator_manifests = typeinfo.get('operator-manifests') or {}
                build_params['operator'] = bool(operator_manifests.get('archive'))
                build = self._create_or_relabel(ContainerKojiBuild, build_params)
            elif self.is_module_build(build_dict):
                module_info = typeinfo.get('module') or {}
                build_params['context'] = module_info.get('context')
                build_params['mbs_id'] = module_info.get('module_build_service_id')
                build_params['module_name'] = module_info.get('name')
                build_params['module_stream'] = module_info.get('stream')
                build_params['module_version'] = module_info.get('version')
                build = self._create_or_relabel(ModuleKojiBuild, build_params)
            else:
                build = KojiBuild.create_or_update(build_params)[0]

            user = User.get_or_create(dict(username=build_dict['owner_name']))[0]
            build.conditional_connect(build.owner, user)

            if module_info is not None:
                self._connect_module_components(build, module_info)

            log.info('Uploaded {0} builds out of {1}'.format(count, len(builds)))

            # Prefer the task recorded on the build row and fall back to the one stored in
            # the build's extra JSON; without either there is no task to inspect
            task_id = build_dict['task_id'] or extra_json.get('container_koji_task_id')
            if not task_id:
                continue

            tasks = self.get_task(task_id)
            if not tasks:
                continue
            task_dict = tasks[0]

            # Only look for the commit hash if the build is an RPM or container
            if task_dict['method'] not in ('build', 'buildContainer'):
                continue

            commit_hash = self._get_commit_hash(task_dict['request'])
            if commit_hash:
                commit = DistGitCommit.get_or_create(dict(hash_=commit_hash))[0]
                build.conditional_connect(build.commit, commit)

    @staticmethod
    def _load_extra(raw_extra):
        """
        Parse the "extra" column of a build row.

        :param str raw_extra: the JSON text from the build row, or None
        :return: the parsed JSON object, or an empty dictionary when the value is missing,
            is not valid JSON, or does not decode to a JSON object
        :rtype: dict
        """
        try:
            extra_json = json.loads(raw_extra)
        except (ValueError, TypeError):
            return {}
        # Valid JSON such as "null" or "[]" would otherwise break the later .get() calls
        if not isinstance(extra_json, dict):
            return {}
        return extra_json

    @staticmethod
    def _create_or_relabel(model, build_params):
        """
        Create or update a build node of a KojiBuild subclass.

        :param model: the build class to use (ContainerKojiBuild or ModuleKojiBuild)
        :param dict build_params: the properties to create or update the node with
        :return: the created or updated build node
        :raises neomodel.exceptions.ConstraintValidationFailed: if the constraint failure is
            not explained by an existing KojiBuild node with the same ID
        """
        try:
            return model.create_or_update(build_params)[0]
        except neomodel.exceptions.ConstraintValidationFailed:
            # This must have errantly been created as a KojiBuild instead of the more
            # specific build type, so let's fix that.
            build = KojiBuild.nodes.get_or_none(id_=build_params['id_'])
            if not build:
                # If there was a constraint validation failure and the build isn't just the
                # wrong label, then we can't recover.
                raise
            build.add_label(model.__label__)
            return model.create_or_update(build_params)[0]

    def _connect_module_components(self, build, module_info):
        """
        Connect a module build to the builds tagged in its content tag and upload them.

        :param build: the module build node to connect the components to
        :param dict module_info: the "module" section of the build's extra JSON
        """
        tag_name = module_info.get('content_koji_tag')
        if not tag_name:
            return

        module_components = self.get_tag_info(tag_name)
        # Some modules don't have components
        if not module_components:
            return

        component_ids = [item['build_id'] for item in module_components]
        for component_id in component_ids:
            component = KojiBuild.get_or_create(dict(id_=component_id))[0]
            build.components.connect(component)

        # Upload the component builds themselves through the same code path
        self.update_neo4j(self.get_build_info(component_ids))

    @staticmethod
    def _get_commit_hash(task_request):
        """
        Extract the commit hash from the XML request of a Koji task.

        The hash is taken from the text after the last "#" of the first <string> element
        whose text starts with "git".

        :param str task_request: the XML text of the task's request
        :return: the commit hash, or None when there is no such element or it has no "#"
        :rtype: str or None
        """
        xml_root = ET.fromstring(task_request)
        for child in xml_root.iter('string'):
            if child.text and child.text.startswith('git'):
                # rpartition never raises, unlike indexing the result of rsplit when the
                # URL carries no "#"
                _, separator, fragment = child.text.rpartition('#')
                if separator and fragment:
                    return fragment
                return None
        return None