User:Hjfocs/add dbpedia mapping.py

From Wikidata
Jump to navigation Jump to search
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import argparse
import logging
import pywikibot
import re
from collections import defaultdict
from datetime import date
from rdflib import Graph, OWL
from sys import exit

### --- BEGIN: constants --- ###
# Wikidata property IDs used by this script
EQUIVALENT_CLASS = 'P1709'  # 'equivalent class'
EQUIVALENT_PROPERTY = 'P1628'  # 'equivalent property'
DESCRIBED_AT_URL = 'P973'  # 'described at URL', used as a qualifier
IMPORTED_FROM = 'P143'  # 'imported from', used as the reference property
RETRIEVED = 'P813'  # 'retrieved', used as a qualifier

# Default location of the DBpedia ontology OWL document
DBPO_URL = 'http://mappings.dbpedia.org/server/ontology/dbpedia.owl'
# Namespace prepended to a DBpedia ontology item name to build the claim target
DBPO_PREFIX = 'http://dbpedia.org/ontology/'
# 'Class:<name>' or 'Property:<name>' is appended to this to build the description URL
DESCRIPTION_PREFIX = 'http://mappings.dbpedia.org/index.php/Ontology'
# Wikidata Item ID of DBpedia, used as the reference value
DBPEDIA_ITEM = 'Q465'

# NOTE: the site and repository objects are created when the module is imported
SITE = pywikibot.Site("wikidata", "wikidata")
REPO = SITE.data_repository()

# The reference has a constant property ('imported from') and a constant value (the DBpedia Item)
REFERENCE = pywikibot.Claim(REPO, IMPORTED_FROM)
DBPEDIA = pywikibot.ItemPage(REPO, DBPEDIA_ITEM)
REFERENCE.setTarget(DBPEDIA)

# Qualifiers have constant properties; no target value is set here
DESCRIBED_AT_URL_QUALIFIER = pywikibot.Claim(REPO, DESCRIBED_AT_URL)
RETRIEVED_QUALIFIER = pywikibot.Claim(REPO, RETRIEVED)
### --- END: constants --- ###


def get_dbpedia_ontology(logger, url=DBPO_URL):
    """
    Load the DBpedia ontology OWL document found at the given URL into an RDF graph.

    :param logger: logger receiving the progress messages
    :param url: location of the OWL document (defaults to DBPO_URL)
    :return: whatever Graph.parse() returns for that URL
    """
    logger.info('Parsing the DBpedia ontology OWL from [%s] ...', url)
    ontology = Graph().parse(url)
    triple_count = len(ontology)
    logger.info('Done! Got [%d] triples', triple_count)
    return ontology


# A Wikidata entity URI ends with the entity ID: Q... for Items, P... for properties.
# Anchoring the whole URI avoids extracting garbage from URIs that merely contain 'wikidata'.
_WIKIDATA_ENTITY_URI = re.compile(r'^https?://www\.wikidata\.org/entity/([PQ][1-9]\d*)$')


def _collect_wikidata_mappings(dbpo, predicate, label, logger):
    """Return the list of {Wikidata ID: DBpedia ontology item name} pairs stated through the given predicate"""
    collected = []
    for dbpedia, equivalent in dbpo.subject_objects(predicate):
        matched = _WIKIDATA_ENTITY_URI.match(equivalent)
        # Skip equivalences that do not point to a Wikidata entity
        if matched is None:
            logger.debug('Skipping non-Wikidata mapping: %s' % {equivalent.toPython(): dbpedia.toPython()})
            continue
        # Skip subjects that are not in the DBpedia ontology namespace, as no item name can be derived
        if not dbpedia.startswith(DBPO_PREFIX):
            logger.debug('Skipping mapping outside the DBpedia ontology namespace: %s' % {equivalent.toPython(): dbpedia.toPython()})
            continue
        # Remove URI namespaces (http://dbpedia.org/ontology/ and http://www.wikidata.org/entity/)
        mapping = {matched.group(1): dbpedia[len(DBPO_PREFIX):]}
        collected.append(mapping)
        logger.debug('%s mapping added: %s' % (label, mapping))
    return collected


def extract_wikidata_mappings(dbpo, logger):
    """
    Extract Wikidata-to-DBpedia mappings from the DBpedia ontology graph representation.

    :param dbpo: graph of the DBpedia ontology, queried through subject_objects()
    :param logger: logger receiving one debug message per kept or skipped mapping
    :return: a defaultdict(list) with the keys 'classes' and 'properties';
        each value is a list of single-entry dicts {Wikidata ID: DBpedia ontology item name}
    """
    mappings = defaultdict(list)
    # Equivalent classes
    mappings['classes'].extend(_collect_wikidata_mappings(dbpo, OWL.equivalentClass, 'Class', logger))
    # Equivalent properties
    mappings['properties'].extend(_collect_wikidata_mappings(dbpo, OWL.equivalentProperty, 'Property', logger))
    return mappings


def add_all_claims(mappings, date_stamp, logger):
    """
    Add all the equivalency claims upon the mapping entries.

    :param mappings: dict-like object with the optional keys 'classes' and 'properties';
        each value is a list of dicts {Wikidata ID: DBpedia ontology item name}
    :param date_stamp: value for the 'retrieved' qualifier, passed through unchanged
    :param logger: logger passed through to add_equivalency_claim
    :return: always 0
    """
    for key, item_type in (('classes', 'class'), ('properties', 'property')):
        # get() tolerates a plain dict that lacks one of the two keys
        for pair in mappings.get(key, ()):
            # items() works on both Python 2 and 3, while iteritems() only exists on Python 2
            for wikidata, dbpedia in pair.items():
                add_equivalency_claim(wikidata, dbpedia, date_stamp, logger, item_type_from_dbpedia=item_type)
    return 0


def today():
    """Build the pywikibot.WbTime object for the current date, with day precision, meant as value of the 'retrieved' qualifier"""
    current = date.today()
    stamp_fields = {'year': current.year, 'month': current.month, 'day': current.day}
    return pywikibot.WbTime(site=REPO, precision='day', **stamp_fields)


def setup_logger(level):
    """
    Configure the root logger to write formatted messages to the console.

    :param level: one of 'debug', 'info' or 'warning'; any other value raises KeyError
    :return: the root logger
    """
    numeric_level = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
    }[level]
    root = logging.getLogger()
    root.setLevel(numeric_level)
    # Console handler carrying the message format
    console = logging.StreamHandler()
    console.setFormatter(logging.Formatter("[%(levelname)-8.8s] %(module)s#%(funcName)s - %(message)s"))
    root.addHandler(console)
    return root


def check_type(item_id):
    """
    Tell from its ID whether a Wikidata entity is a class or a property.

    :param item_id: the entity ID as a string
    :return: 'class' when the ID starts with 'Q', 'property' when it starts with 'P', None otherwise
    """
    # TODO more robust class checking via the existence of an instance-of property
    for prefix, kind in (('Q', 'class'), ('P', 'property')):
        if item_id.startswith(prefix):
            return kind
    return None


def add_qualifiers(claim, item_type, dbpedia_ontology_item, date_stamp, logger):
    """
    Add the following 2 qualifiers to the given claim:
    1.  property = 'described at URL' (P973);
        value = the URL of the description of the DBpedia ontology item;
    2.  property = 'retrieved' (P813);
        value = date stamp corresponding to when the mapping was retrieved (i.e., when the claim was added)

    :param claim: the claim receiving the qualifiers
    :param item_type: 'class' or 'property', selects the description URL namespace
    :param dbpedia_ontology_item: name of the DBpedia ontology item, appended to the description URL
    :param date_stamp: value of the 'retrieved' qualifier
    :param logger: logger receiving one debug message per added qualifier
    :return: 0 when both qualifiers were added, 1 when item_type is unknown (nothing is added)
    """
    if item_type == 'class':
        description_url = DESCRIPTION_PREFIX + 'Class:' + dbpedia_ontology_item
    elif item_type == 'property':
        description_url = DESCRIPTION_PREFIX + 'Property:' + dbpedia_ontology_item
    # Nothing to do, code execution should never reach this condition
    else:
        return 1
    # Build fresh qualifier objects on every call, so that an object already attached
    # to a previous claim is never re-targeted and attached again.
    # NOTE(review): recent pywikibot releases appear to reject such reuse - confirm against the installed version
    described_at_url = pywikibot.Claim(REPO, DESCRIBED_AT_URL)
    described_at_url.setTarget(description_url)
    retrieved = pywikibot.Claim(REPO, RETRIEVED)
    retrieved.setTarget(date_stamp)
    claim.addQualifier(described_at_url)
    logger.debug('Qualifier added: %s' % described_at_url.toJSON())
    claim.addQualifier(retrieved)
    # Log the 'retrieved' qualifier itself, not the 'described at URL' one a second time
    logger.debug('Qualifier added: %s' % retrieved.toJSON())
    return 0


def add_reference(claim, logger):
    """
    Add a reference to the given claim.
    Uses 'imported from' (P143) as property,
    and the DBpedia Item (Q465) as value.

    :param claim: the claim receiving the reference
    :param logger: logger receiving a debug message with the reference JSON
    :return: always 0
    """
    # Build a fresh reference object on every call, so that an object already attached
    # to a previous claim is never attached again.
    # NOTE(review): recent pywikibot releases appear to reject such reuse - confirm against the installed version
    reference = pywikibot.Claim(REPO, IMPORTED_FROM)
    reference.setTarget(DBPEDIA)
    claim.addSource(reference)
    logger.debug('Reference added: %s' % reference.toJSON())
    return 0
    
    
def add_equivalency_claim(item_id, dbpedia_ontology_item, date_stamp, logger, item_type_from_dbpedia=None):
    """
    Add an equivalency claim to an Item, given its ID.
    The claim maps to a DBpedia ontology item.
    See http://mappings.dbpedia.org/server/ontology/

    :param item_id: ID of the Wikidata entity receiving the claim
    :param dbpedia_ontology_item: name of the DBpedia ontology item, appended to DBPO_PREFIX to build the claim value
    :param date_stamp: value of the 'retrieved' qualifier
    :param logger: logger receiving the progress messages
    :param item_type_from_dbpedia: 'class' or 'property', i.e., the type according to DBpedia
    :return: 1 when the type derived from the ID disagrees with DBpedia (no claim is added), 0 otherwise
    """
    logger.info('Processing Wikidata Item [%s] ...' % item_id)
    # str() keeps the messages working when the None default is used
    type_from_dbpedia = str(item_type_from_dbpedia).upper()
    logger.debug('DBpedia thinks it is a %s. Double checking it...' % type_from_dbpedia)
    item_type = check_type(item_id)
    # Nothing to do
    if item_type is None:
        logger.warning('[%s] is neither a class nor a property. Skipping...' % item_id)
        return 0
    logger.debug('I think it is a %s.' % item_type.upper())
    if item_type != item_type_from_dbpedia:
        logger.debug('I disagree with DBpedia!')
        logger.warning('DBpedia thinks [%s] is a %s, I think it is a %s. No claims will be added.' % (item_id, type_from_dbpedia, item_type.upper()))
        return 1
    logger.debug('I agree with DBpedia!')
    # The page object is only built once the ID is known to be usable.
    # Properties are addressed through PropertyPage, Items through ItemPage.
    if item_type == 'class':
        claim_property = EQUIVALENT_CLASS
        page = pywikibot.ItemPage(REPO, item_id)
    else:
        claim_property = EQUIVALENT_PROPERTY
        page = pywikibot.PropertyPage(REPO, item_id)
    logger.info("Adding the 'equivalent %s' (%s) claim..." % (item_type, claim_property))
    claim = pywikibot.Claim(REPO, claim_property)
    claim.setTarget(DBPO_PREFIX + dbpedia_ontology_item)
    page.addClaim(claim)
    add_qualifiers(claim, item_type, dbpedia_ontology_item, date_stamp, logger)
    add_reference(claim, logger)
    return 0


def create_cli_parser():
    """
    Create the command line arguments parser with proper help.

    :return: an argparse.ArgumentParser exposing --level, --dbpo-url and --sandbox
    """
    parser = argparse.ArgumentParser(description='Add DBpedia ontology mappings to Wikidata classes and properties')
    # 'info' is the default, so it must be an accepted value when passed explicitly as well
    parser.add_argument('-l', '--level', choices=['debug', 'info', 'warning'], default='info', help='Set log level. Allowed values are "debug", "info" or "warning". Default is "info"')
    parser.add_argument('--dbpo-url', help='Specify an alternative URL for the DBpedia ontology')
    parser.add_argument('--sandbox', action='store_true', help='Run the bot with test mappings over Wikidata sandbox items')
    return parser


def main(args):
    """
    Run the bot, either over the sandbox test mappings or over the mappings extracted from the DBpedia ontology.

    :param args: parsed command line arguments, with the attributes level, sandbox and dbpo_url
    :return: always 0
    """
    logger = setup_logger(args.level)
    date_stamp = today()
    if args.sandbox:
        logger.info("Running with test mappings over Wikidata sandbox items...")
        test_mappings = {
            'classes': [{'Q13406268': 'Espresso'}, {'Q13406268': 'Macchiato'}, {'Q4115189': 'Beer'}],
            'properties': [{'Q4115189': 'Cheese'}, {'Q4115189': 'Wine'}, {'Q13406268': 'Corretto'}],
        }
        add_all_claims(test_mappings, date_stamp, logger)
        return 0
    # Only pass the URL along when one was given, otherwise rely on the default
    if args.dbpo_url:
        dbpo = get_dbpedia_ontology(logger, url=args.dbpo_url)
    else:
        dbpo = get_dbpedia_ontology(logger)
    mappings = extract_wikidata_mappings(dbpo, logger)
    class_count = len(mappings['classes'])
    property_count = len(mappings['properties'])
    counts = (class_count, property_count, class_count + property_count)
    logger.info('A maximum of [%d] class and [%d] property mappings will be added. Total: [%d] (it may be less depending on the Item type double checking).' % counts)
    add_all_claims(mappings, date_stamp, logger)
    return 0


if __name__ == '__main__':
    # Script entry point: parse the command line, run the bot and use its return value as exit status
    exit(main(create_cli_parser().parse_args()))