Source code for oaebu_workflows.ucl_discovery_telescope.ucl_discovery_telescope

# Copyright 2023 Curtin University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Author: Aniek Roelofs, Keegan Smith


import logging
import os
from typing import List, Union
from concurrent.futures import ThreadPoolExecutor, as_completed

import pendulum
from airflow.hooks.base import BaseHook
from google.cloud.bigquery import SourceFormat, TimePartitioningType, WriteDisposition
from google.oauth2 import service_account
from apiclient import discovery

from oaebu_workflows.oaebu_partners import OaebuPartner, partner_from_str
from observatory.api.client.model.dataset_release import DatasetRelease
from observatory.platform.api import make_observatory_api
from observatory.platform.airflow import AirflowConns
from observatory.platform.files import save_jsonl_gz, load_jsonl
from observatory.platform.gcs import gcs_blob_uri, gcs_upload_files, gcs_blob_name_from_path
from observatory.platform.bigquery import bq_load_table, bq_table_id, bq_create_dataset
from observatory.platform.utils.url_utils import retry_get_url
from observatory.platform.observatory_config import CloudWorkspace
from observatory.platform.files import add_partition_date
from observatory.platform.workflows.workflow import (
    Workflow,
    PartitionRelease,
    cleanup,
    set_task_state,
    check_workflow_inputs,
)


class UclDiscoveryRelease(PartitionRelease):
    def __init__(
        self,
        dag_id: str,
        run_id: str,
        data_interval_start: pendulum.DateTime,
        data_interval_end: pendulum.DateTime,
        partition_date: pendulum.DateTime,
    ):
        """Construct a UclDiscoveryRelease instance.

        :param dag_id: The ID of the DAG.
        :param run_id: The Airflow run ID.
        :param data_interval_start: The start of the data interval.
        :param data_interval_end: The end of the data interval.
        :param partition_date: The partition date for this release.
        """
        super().__init__(dag_id=dag_id, run_id=run_id, partition_date=partition_date)
        self.data_interval_start = data_interval_start
        self.data_interval_end = data_interval_end
        self.download_country_path = os.path.join(self.download_folder, "ucl_discovery_country.jsonl.gz")
        self.download_totals_path = os.path.join(self.download_folder, "ucl_discovery_totals.jsonl.gz")
        self.transform_path = os.path.join(self.transform_folder, "ucl_discovery.jsonl.gz")

class UclDiscoveryTelescope(Workflow):
    """The UCL Discovery telescope."""

    def __init__(
        self,
        dag_id: str,
        cloud_workspace: CloudWorkspace,
        sheet_id: str,
        data_partner: Union[str, OaebuPartner] = "ucl_discovery",
        bq_dataset_description: str = "UCL Discovery dataset",
        bq_table_description: str = "UCL Discovery table",
        api_dataset_id: str = "ucl",
        observatory_api_conn_id: str = AirflowConns.OBSERVATORY_API,
        oaebu_service_account_conn_id: str = "oaebu_service_account",
        max_threads: int = os.cpu_count() * 2,
        schedule: str = "0 0 4 * *",  # run on the 4th of every month
        start_date: pendulum.DateTime = pendulum.datetime(2015, 6, 1),
        catchup: bool = True,
        max_active_runs: int = 10,
    ):
        """Construct a UclDiscoveryTelescope instance.

        :param dag_id: The ID of the DAG.
        :param cloud_workspace: The CloudWorkspace object for this DAG.
        :param sheet_id: The ID of the Google Sheet that maps eprint IDs to ISBN13s.
        :param data_partner: The name of the data partner.
        :param bq_dataset_description: Description for the BigQuery dataset.
        :param bq_table_description: Description for the BigQuery table.
        :param api_dataset_id: The ID to store the dataset release in the API.
        :param observatory_api_conn_id: Airflow connection ID for the Observatory API.
        :param oaebu_service_account_conn_id: Airflow connection ID for the OAeBU service account.
        :param max_threads: The maximum number of threads to use for parallel processes.
        :param schedule: The schedule interval of the DAG.
        :param start_date: The start date of the DAG.
        :param catchup: Whether or not to catch up the DAG.
        :param max_active_runs: The maximum number of concurrent DAG runs.
        """
        super().__init__(
            dag_id,
            start_date,
            schedule,
            catchup=catchup,
            max_active_runs=max_active_runs,
            airflow_conns=[observatory_api_conn_id, oaebu_service_account_conn_id],
            tags=["oaebu"],
        )
        self.dag_id = dag_id
        self.cloud_workspace = cloud_workspace
        self.sheet_id = sheet_id
        self.data_partner = partner_from_str(data_partner)
        self.bq_dataset_description = bq_dataset_description
        self.bq_table_description = bq_table_description
        self.api_dataset_id = api_dataset_id
        self.oaebu_service_account_conn_id = oaebu_service_account_conn_id
        self.max_threads = max_threads
        self.observatory_api_conn_id = observatory_api_conn_id

        check_workflow_inputs(self)

        self.add_setup_task(self.check_dependencies)
        self.add_task(self.download)
        self.add_task(self.upload_downloaded)
        self.add_task(self.transform)
        self.add_task(self.upload_transformed)
        self.add_task(self.bq_load)
        self.add_task(self.add_new_dataset_releases)
        self.add_task(self.cleanup)

    def make_release(self, **kwargs) -> UclDiscoveryRelease:
        """Make a release instance. The release is passed as an argument to the function (TelescopeFunction) that is
        called in 'task_callable'. There is only one release per run.

        :param kwargs: the context passed from the PythonOperator.
            See https://airflow.apache.org/docs/stable/macros-ref.html for the keyword arguments that can be passed.
        :return: A single UclDiscoveryRelease instance.
        """
        data_interval_start = kwargs["data_interval_start"].start_of("month")
        data_interval_end = kwargs["data_interval_end"].start_of("month")
        partition_date = data_interval_start.end_of("month")
        run_id = kwargs["run_id"]
        logging.info(
            f"Interval Start: {data_interval_start}, Interval End: {data_interval_end}, "
            f"Partition date: {partition_date}, Run ID: {run_id}"
        )
        return UclDiscoveryRelease(
            self.dag_id,
            run_id,
            data_interval_start=data_interval_start,
            data_interval_end=data_interval_end,
            partition_date=partition_date,
        )

    def download(self, release: UclDiscoveryRelease, **kwargs):
        """Download the UCL Discovery data for a given release.

        :param release: The UCL Discovery release.
        """
        mappings = get_isbn_eprint_mappings(self.sheet_id, self.oaebu_service_account_conn_id, release.partition_date)
        with ThreadPoolExecutor(self.max_threads) as executor:
            futures = []
            for eprint_id in mappings.keys():
                future = executor.submit(
                    download_discovery_stats, eprint_id, release.data_interval_start, release.partition_date
                )
                futures.append(future)
            totals = []
            country = []
            for future in as_completed(futures):
                c, t = future.result()
                country.append(c)
                totals.append(t)

        logging.info(f"Saving totals data to file: {release.download_totals_path}")
        save_jsonl_gz(release.download_totals_path, totals)
        logging.info(f"Saving country data to file: {release.download_country_path}")
        save_jsonl_gz(release.download_country_path, country)

    def upload_downloaded(self, release: UclDiscoveryRelease, **kwargs):
        """Uploads the downloaded files to GCS."""
        success = gcs_upload_files(
            bucket_name=self.cloud_workspace.download_bucket,
            file_paths=[release.download_country_path, release.download_totals_path],
        )
        set_task_state(success, kwargs["ti"].task_id, release=release)

    def transform(self, release: UclDiscoveryRelease, **kwargs):
        """Transform the UCL Discovery data for a given release."""
        mappings = get_isbn_eprint_mappings(self.sheet_id, self.oaebu_service_account_conn_id, release.partition_date)

        # Load the records and sort them by eprint ID
        country_records = load_jsonl(release.download_country_path)
        totals_records = load_jsonl(release.download_totals_path)
        country_records = sorted(country_records, key=lambda x: x["set"]["value"])  # ["set"]["value"] = eprint_id
        totals_records = sorted(totals_records, key=lambda x: x["set"]["value"])
        assert len(country_records) == len(totals_records), f"{len(country_records)} != {len(totals_records)}"

        with ThreadPoolExecutor(self.max_threads) as executor:
            futures = []
            for country_record, totals_record in zip(country_records, totals_records):
                isbn = mappings[country_record["set"]["value"]]["ISBN13"]
                title = mappings[country_record["set"]["value"]]["title"]
                future = executor.submit(transform_discovery_stats, country_record, totals_record, isbn, title)
                futures.append(future)
            results = []
            for future in as_completed(futures):
                results.append(future.result())

        # Add the release date to the data as a partition field
        results = add_partition_date(
            results, release.partition_date, TimePartitioningType.MONTH, partition_field="release_date"
        )
        save_jsonl_gz(release.transform_path, results)

    def upload_transformed(self, release: UclDiscoveryRelease, **kwargs):
        """Uploads the transformed file to GCS."""
        success = gcs_upload_files(
            bucket_name=self.cloud_workspace.transform_bucket, file_paths=[release.transform_path]
        )
        set_task_state(success, kwargs["ti"].task_id, release=release)

    def bq_load(self, release: UclDiscoveryRelease, **kwargs) -> None:
        """Loads the transformed data into BigQuery."""
        bq_create_dataset(
            project_id=self.cloud_workspace.project_id,
            dataset_id=self.data_partner.bq_dataset_id,
            location=self.cloud_workspace.data_location,
            description=self.bq_dataset_description,
        )
        uri = gcs_blob_uri(self.cloud_workspace.transform_bucket, gcs_blob_name_from_path(release.transform_path))
        table_id = bq_table_id(
            self.cloud_workspace.project_id, self.data_partner.bq_dataset_id, self.data_partner.bq_table_name
        )
        state = bq_load_table(
            uri=uri,
            table_id=table_id,
            schema_file_path=self.data_partner.schema_path,
            source_format=SourceFormat.NEWLINE_DELIMITED_JSON,
            partition_type=TimePartitioningType.MONTH,
            partition=True,
            partition_field="release_date",
            write_disposition=WriteDisposition.WRITE_APPEND,
            table_description=self.bq_table_description,
            ignore_unknown_values=True,
        )
        set_task_state(state, kwargs["ti"].task_id, release=release)

    def add_new_dataset_releases(self, release: UclDiscoveryRelease, **kwargs) -> None:
        """Adds release information to API."""
        api = make_observatory_api(observatory_api_conn_id=self.observatory_api_conn_id)
        dataset_release = DatasetRelease(
            dag_id=self.dag_id,
            dataset_id=self.api_dataset_id,
            dag_run_id=release.run_id,
            data_interval_start=kwargs["data_interval_start"],
            data_interval_end=kwargs["data_interval_end"],
            partition_date=release.partition_date,
        )
        api.post_dataset_release(dataset_release)

    def cleanup(self, release: UclDiscoveryRelease, **kwargs) -> None:
        """Delete all files, folders and XComs associated with this release."""
        cleanup(dag_id=self.dag_id, execution_date=kwargs["execution_date"], workflow_folder=release.workflow_folder)
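

# A minimal sketch of how this telescope might be wired up in a DAG file. The project, bucket and sheet values
# below are placeholder assumptions rather than real settings, and Workflow.make_dag() is assumed to be provided
# by the observatory platform; adapt to your own deployment.
#
# workflow = UclDiscoveryTelescope(
#     dag_id="ucl_discovery",
#     cloud_workspace=CloudWorkspace(
#         project_id="my-gcp-project",  # placeholder
#         download_bucket="my-download-bucket",  # placeholder
#         transform_bucket="my-transform-bucket",  # placeholder
#         data_location="us",  # placeholder
#     ),
#     sheet_id="<google-sheet-id>",  # placeholder
# )
# dag = workflow.make_dag()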


def get_isbn_eprint_mappings(sheet_id: str, service_account_conn_id: str, cutoff_date: pendulum.DateTime) -> dict:
    """Get the eprint ID to ISBN mapping from the Google Sheet.

    :param sheet_id: The ID of the Google Sheet.
    :param service_account_conn_id: The Airflow connection ID of the service account to authenticate with.
    :param cutoff_date: The cutoff date. If an item is published after this date, it will be skipped.
    :return: A dictionary mapping each eprint ID to its ISBN13 and title.
    """
    scopes = [
        "https://www.googleapis.com/auth/drive",
        "https://www.googleapis.com/auth/drive.file",
        "https://www.googleapis.com/auth/spreadsheets",
    ]
    service_account_conn = BaseHook.get_connection(service_account_conn_id)
    credentials = service_account.Credentials.from_service_account_info(
        service_account_conn.extra_dejson, scopes=scopes
    )
    service = discovery.build("sheets", "v4", credentials=credentials)
    result = service.spreadsheets().values().get(spreadsheetId=sheet_id, range="isbn_mapping").execute()
    sheet_contents = result.get("values")
    if not sheet_contents:
        raise ValueError(f"No content found for sheet with ID {sheet_id}")

    items = []
    header = sheet_contents[0]
    if not all(heading in header for heading in ["ISBN13", "discovery_eprintid", "date", "title_list_title"]):
        raise ValueError(f"Invalid header found for sheet: {header}")
    for row in sheet_contents[1:]:
        items.append(dict(zip(header, row)))

    mappings = {}
    for item in items:
        eprint_id = item.get("discovery_eprintid")
        isbn = item.get("ISBN13")
        title = item.get("title_list_title")
        if not eprint_id or not isbn:
            logging.warning(f"Item with missing information will be skipped: {item}")
            continue
        if pendulum.parse(item["date"]) > cutoff_date:
            logging.info(f"Item released after cutoff date will be skipped: {item}")
            continue
        mappings[eprint_id] = {"ISBN13": isbn, "title": title}

    return mappings
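

# Illustrative shape of the mapping returned by get_isbn_eprint_mappings (the eprint IDs, ISBNs and titles below
# are invented for clarity):
# {
#     "1234567": {"ISBN13": "9781234567890", "title": "An Example Title"},
#     "7654321": {"ISBN13": "9780987654321", "title": "Another Example Title"},
# }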


def download_discovery_stats(eprint_id: str, start_date: pendulum.DateTime, end_date: pendulum.DateTime):
    """Downloads the discovery stats for a given eprint ID within a specified date range.

    :param eprint_id: The eprint ID of the item to get the stats for.
    :param start_date: The start date of the date range.
    :param end_date: The end date of the date range.
    :return: A tuple containing the country statistics and the total downloads statistics.
    """
    countries_url = (
        "https://discovery.ucl.ac.uk/cgi/stats/get"
        f"?from={start_date.format('YYYYMMDD')}&to={end_date.format('YYYYMMDD')}"
        f"&irs2report=eprint&set_name=eprint&set_value={eprint_id}&datatype=countries&top=countries"
        "&view=Table&limit=all&export=JSON"
    )
    totals_url = (
        "https://discovery.ucl.ac.uk/cgi/stats/get"
        f"?from={start_date.format('YYYYMMDD')}&to={end_date.format('YYYYMMDD')}"
        f"&irs2report=eprint&set_name=eprint&set_value={eprint_id}&datatype=downloads&graph_type=column"
        "&view=Google%3A%3AGraph&date_resolution=month&title=Download+activity+-+last+12+months&export=JSON"
    )
    response = retry_get_url(countries_url)
    country = response.json()
    response = retry_get_url(totals_url)
    totals = response.json()

    # Perform some checks on the returned data
    timescale = (start_date.format("YYYYMMDD"), end_date.format("YYYYMMDD"))
    country_timescale = (country["timescale"]["from"], country["timescale"]["to"])
    totals_timescale = (totals["timescale"]["from"], totals["timescale"]["to"])
    if country_timescale != timescale or totals_timescale != timescale:
        raise ValueError(
            f"Invalid timescale value(s): country: {country['timescale']} | totals: {totals['timescale']} != {timescale}"
        )
    if country["set"]["value"] != eprint_id or totals["set"]["value"] != eprint_id:
        raise ValueError(
            f"Invalid eprint ID values downloaded: {totals['set']['value']} | {country['set']['value']} != {eprint_id}"
        )

    return country, totals


def transform_discovery_stats(country_record: dict, totals_record: dict, isbn: str, title: str) -> dict:
    """Transforms the discovery stats for a single set of records.

    :param country_record: The country record.
    :param totals_record: The totals record.
    :param isbn: The ISBN that matches the eprint ID.
    :param title: The title that matches the eprint ID.
    :return: The transformed stats.
    """
    # Sanity check the records
    country_eprint_id = country_record["set"]["value"]
    totals_eprint_id = totals_record["set"]["value"]
    if country_eprint_id != totals_eprint_id:
        raise ValueError(f"Country and totals eprint ID do not match: {country_eprint_id} != {totals_eprint_id}")

    country_timescale = country_record["timescale"]
    totals_timescale = totals_record["timescale"]
    if country_timescale != totals_timescale:
        raise ValueError(f"Timescales do not match: {country_timescale} != {totals_timescale}")

    # If there are no downloads for the time period, there is no "records" field in the country stats
    country_records = country_record.get("records", [])
    transformed = {
        "ISBN": isbn,
        "title": title,
        "eprint_id": totals_record["set"]["value"],
        "timescale": totals_record["timescale"],
        "origin": totals_record["origin"],
        "total_downloads": totals_record["records"][0]["count"],
        "country": country_records,
    }
    return transformed
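

# Illustrative shape of a record produced by transform_discovery_stats. The field values are invented, and the
# contents of "origin" and each "country" entry depend on what the UCL Discovery stats API returns, so they are
# only sketched here:
# {
#     "ISBN": "9781234567890",
#     "title": "An Example Title",
#     "eprint_id": "1234567",
#     "timescale": {"from": "20230601", "to": "20230630"},
#     "origin": "...",
#     "total_downloads": 42,
#     "country": [...],
# }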