User:Hjfocs/add dbpedia mapping.py
Jump to navigation
Jump to search
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import argparse
import logging
import pywikibot
import re
from collections import defaultdict
from datetime import date
from rdflib import Graph, OWL
from sys import exit
### --- BEGIN: constants --- ###
# Wikidata property IDs used for the claims, qualifiers and references written by this bot
EQUIVALENT_CLASS = 'P1709'  # 'equivalent class'
EQUIVALENT_PROPERTY = 'P1628'  # 'equivalent property'
DESCRIBED_AT_URL = 'P973'  # 'described at URL' (qualifier)
IMPORTED_FROM = 'P143'  # 'imported from' (reference)
RETRIEVED = 'P813'  # 'retrieved' (qualifier)
# Default location of the DBpedia ontology OWL document
DBPO_URL = 'http://mappings.dbpedia.org/server/ontology/dbpedia.owl'
# Namespace prepended to a DBpedia ontology item name to build the claim value
DBPO_PREFIX = 'http://dbpedia.org/ontology/'
# Prefix of the mappings wiki page describing a DBpedia ontology item
# ('Class:<name>' or 'Property:<name>' is appended to it)
DESCRIPTION_PREFIX = 'http://mappings.dbpedia.org/index.php/Ontology'
# Wikidata Item ID of DBpedia, used as the value of the reference
DBPEDIA_ITEM = 'Q465'
# NOTE: the statements below run at import time and build pywikibot objects bound to Wikidata
SITE = pywikibot.Site("wikidata", "wikidata")
REPO = SITE.data_repository()
# The reference has a constant property ('imported from') and a constant value (the DBpedia Item)
REFERENCE = pywikibot.Claim(REPO, IMPORTED_FROM)
DBPEDIA = pywikibot.ItemPage(REPO, DBPEDIA_ITEM)
REFERENCE.setTarget(DBPEDIA)
# Qualifiers have constant properties; no target is set here
# NOTE(review): these Claim objects are module-level singletons; recent pywikibot versions
# may refuse to attach the same Claim instance to more than one claim - TODO confirm
DESCRIBED_AT_URL_QUALIFIER = pywikibot.Claim(REPO, DESCRIBED_AT_URL)
RETRIEVED_QUALIFIER = pywikibot.Claim(REPO, RETRIEVED)
### --- END: constants --- ###
def get_dbpedia_ontology(logger, url=DBPO_URL):
    """
    Load the DBpedia ontology OWL document into an RDF graph.

    :param logger: logger used to report progress
    :param url: location of the OWL document (defaults to DBPO_URL)
    :return: whatever ``Graph().parse(url)`` returns, i.e. the parsed ontology
    """
    start_message = 'Parsing the DBpedia ontology OWL from [%s] ...' % url
    logger.info(start_message)
    graph = Graph()
    ontology = graph.parse(url)
    # The length of the parsed graph is reported as its number of triples
    triple_count = len(ontology)
    logger.info('Done! Got [%d] triples' % triple_count)
    return ontology
def _collect_wikidata_equivalents(dbpo, predicate, label, logger):
    """
    Collect the ``{wikidata_id: dbpedia_name}`` pairs linked by one OWL equivalence predicate.

    :param dbpo: graph of the DBpedia ontology, queried via ``subject_objects``
    :param predicate: equivalence predicate to look up (e.g. ``OWL.equivalentClass``)
    :param label: capitalized kind of mapping ('Class' or 'Property'), only used in log messages
    :param logger: logger used to report added and skipped mappings
    :return: list of single-entry dictionaries, one per mapping found
    """
    collected = []
    for dbpedia, equivalent in dbpo.subject_objects(predicate):
        # Skip equivalents that do not point to Wikidata
        if 'wikidata' not in equivalent:
            logger.debug('Skipping non-Wikidata mapping: %s' % {equivalent.toPython(): dbpedia.toPython()})
            continue
        # Only subjects inside the DBpedia ontology namespace can be turned into an ontology item name
        if not dbpedia.startswith(DBPO_PREFIX):
            logger.warning('Skipping mapping with a subject outside the DBpedia ontology namespace: %s' % {equivalent.toPython(): dbpedia.toPython()})
            continue
        # The Wikidata ID is the last path segment of the entity URI. Taking it this way
        # (instead of slicing at a fixed offset) works regardless of the URI scheme or host.
        wikidata_id = equivalent.rstrip('/').rsplit('/', 1)[-1]
        # DBpedia names may contain slashes (e.g. 'Person/height'), so strip the known prefix only
        dbpedia_name = dbpedia[len(DBPO_PREFIX):]
        mapping = {wikidata_id: dbpedia_name}
        collected.append(mapping)
        logger.debug('%s mapping added: %s' % (label, mapping))
    return collected


def extract_wikidata_mappings(dbpo, logger):
    """
    Extract Wikidata-to-DBpedia mappings from the DBpedia ontology graph representation.

    :param dbpo: graph of the DBpedia ontology
    :param logger: logger used to report added and skipped mappings
    :return: dictionary with the 'classes' and 'properties' keys, each holding a list of
             ``{wikidata_id: dbpedia_ontology_item_name}`` dictionaries
    """
    mappings = defaultdict(list)
    # Equivalent classes
    mappings['classes'].extend(_collect_wikidata_equivalents(dbpo, OWL.equivalentClass, 'Class', logger))
    # Equivalent properties
    mappings['properties'].extend(_collect_wikidata_equivalents(dbpo, OWL.equivalentProperty, 'Property', logger))
    return mappings
def add_all_claims(mappings, date_stamp, logger):
    """
    Add all the equivalency claims upon the mapping entries.

    :param mappings: dictionary with the 'classes' and 'properties' keys, each holding a list of
                     ``{wikidata_id: dbpedia_ontology_item_name}`` dictionaries; a missing key
                     is treated as an empty list
    :param date_stamp: value for the 'retrieved' qualifier, passed through to each claim
    :param logger: logger passed through to each claim addition
    :return: always 0
    """
    # Mapping key -> item type announced to add_equivalency_claim
    for key, item_type in (('classes', 'class'), ('properties', 'property')):
        for pair in mappings.get(key, ()):
            # items() works on both Python 2 and 3, unlike iteritems()
            for wikidata, dbpedia in pair.items():
                add_equivalency_claim(wikidata, dbpedia, date_stamp, logger, item_type_from_dbpedia=item_type)
    return 0
def today():
    """
    Build the date stamp used as value of the 'retrieved' qualifier.

    :return: a pywikibot.WbTime holding the date given by ``date.today()``, with 'day' precision
    """
    now = date.today()
    stamp_fields = {'year': now.year, 'month': now.month, 'day': now.day}
    return pywikibot.WbTime(site=REPO, precision='day', **stamp_fields)
def setup_logger(level):
    """
    Configure the root logger to print formatted messages to the console.

    :param level: one of 'info', 'warning' or 'debug'; any other value raises KeyError
    :return: the root logger, with a new console handler attached
    """
    root = logging.getLogger()
    root.setLevel({'info': logging.INFO, 'warning': logging.WARNING, 'debug': logging.DEBUG}[level])
    # Console handler with the message format: [LEVEL   ] module#function - message
    console = logging.StreamHandler()
    console.setFormatter(logging.Formatter("[%(levelname)-8.8s] %(module)s#%(funcName)s - %(message)s"))
    root.addHandler(console)
    return root
def check_type(item_id):
    """
    Tell whether a Wikidata ID denotes a class or a property, based on its first letter.

    :param item_id: Wikidata ID, e.g. 'Q465' or 'P1709'
    :return: 'class' for IDs starting with 'Q', 'property' for IDs starting with 'P', None otherwise
    """
    # TODO more robust class checking via the existence of an instance-of property
    for prefix, kind in (('Q', 'class'), ('P', 'property')):
        if item_id.startswith(prefix):
            return kind
    return None
def add_qualifiers(claim, item_type, dbpedia_ontology_item, date_stamp, logger):
    """
    Add the following 2 qualifiers to the given claim:
    1. property = 'described at URL' (P973);
       value = the URL of the description of the DBpedia ontology item;
    2. property = 'retrieved' (P813);
       value = date stamp corresponding to when the mapping was retrieved (i.e., when the claim was added)

    :param claim: the claim to be qualified
    :param item_type: 'class' or 'property', selects the description page namespace
    :param dbpedia_ontology_item: name of the DBpedia ontology item
    :param date_stamp: value of the 'retrieved' qualifier
    :param logger: logger used to report the added qualifiers
    :return: 0 when the qualifiers were added, 1 when item_type is unknown (nothing is added)
    """
    if item_type == 'class':
        page_namespace = 'Class:'
    elif item_type == 'property':
        page_namespace = 'Property:'
    # Nothing to do, code execution should never reach this condition
    else:
        return 1
    # Build fresh qualifier objects on every call: a Claim instance that was already attached
    # to a claim must not be re-targeted and attached to another one
    described_at_url = pywikibot.Claim(REPO, DESCRIBED_AT_URL)
    described_at_url.setTarget(DESCRIPTION_PREFIX + page_namespace + dbpedia_ontology_item)
    retrieved = pywikibot.Claim(REPO, RETRIEVED)
    retrieved.setTarget(date_stamp)
    for qualifier in (described_at_url, retrieved):
        claim.addQualifier(qualifier)
        # Log the qualifier that was actually added
        logger.debug('Qualifier added: %s' % qualifier.toJSON())
    return 0
def add_reference(claim, logger):
    """
    Add a reference to the given claim.
    Uses 'imported from' (P143) as property,
    and the DBpedia Item (Q465) as value.

    :param claim: the claim to be referenced
    :param logger: logger used to report the added reference
    :return: always 0
    """
    # Build a fresh reference on every call: a Claim instance that was already attached
    # to a claim must not be attached to another one
    reference = pywikibot.Claim(REPO, IMPORTED_FROM)
    reference.setTarget(pywikibot.ItemPage(REPO, DBPEDIA_ITEM))
    claim.addSource(reference)
    logger.debug('Reference added: %s' % reference.toJSON())
    return 0
def add_equivalency_claim(item_id, dbpedia_ontology_item, date_stamp, logger, item_type_from_dbpedia=None):
    """
    Add an equivalency claim to a Wikidata entity, given its ID.
    The claim maps to a DBpedia ontology item.
    See http://mappings.dbpedia.org/server/ontology/

    :param item_id: Wikidata ID of the entity ('Q...' for classes, 'P...' for properties)
    :param dbpedia_ontology_item: name of the DBpedia ontology item to map to
    :param date_stamp: value of the 'retrieved' qualifier
    :param logger: logger used to report progress
    :param item_type_from_dbpedia: 'class' or 'property' according to DBpedia; when None,
                                   no double check is made and the type derived from the ID is used
    :return: 1 when the type derived from the ID disagrees with DBpedia (no claim is added),
             0 otherwise (including when the ID is neither a class nor a property and is skipped)
    """
    logger.info('Processing Wikidata Item [%s] ...' % item_id)
    item_type = check_type(item_id)
    # Nothing to do
    if item_type is None:
        logger.warning('[%s] is neither a class nor a property. Skipping...' % item_id)
        return 0
    if item_type_from_dbpedia is not None:
        # str() guards the log lines against a non-string type hint
        dbpedia_type_label = str(item_type_from_dbpedia).upper()
        logger.debug('DBpedia thinks it is a %s. Double checking it...' % dbpedia_type_label)
        logger.debug('I think it is a %s.' % item_type.upper())
        if item_type != item_type_from_dbpedia:
            logger.debug('I disagree with DBpedia!')
            logger.warning('DBpedia thinks [%s] is a %s, I think it is a %s. No claims will be added.' % (item_id, dbpedia_type_label, item_type.upper()))
            return 1
        logger.debug('I agree with DBpedia!')
    if item_type == 'class':
        claim_name, claim_property, page_class = 'equivalent class', EQUIVALENT_CLASS, pywikibot.ItemPage
    else:
        # Property IDs are not valid Item page titles, so they need a PropertyPage
        claim_name, claim_property, page_class = 'equivalent property', EQUIVALENT_PROPERTY, pywikibot.PropertyPage
    logger.info("Adding the '%s' (%s) claim..." % (claim_name, claim_property))
    # The page object is only built once the entity type is known and accepted
    entity = page_class(REPO, item_id)
    claim = pywikibot.Claim(REPO, claim_property)
    claim.setTarget(DBPO_PREFIX + dbpedia_ontology_item)
    entity.addClaim(claim)
    add_qualifiers(claim, item_type, dbpedia_ontology_item, date_stamp, logger)
    add_reference(claim, logger)
    return 0
def create_cli_parser():
    """
    Create the command line arguments parser with proper help.

    :return: an argparse.ArgumentParser accepting -l/--level, --dbpo-url and --sandbox
    """
    parser = argparse.ArgumentParser(description='Add DBpedia ontology mappings to Wikidata classes and properties')
    # 'info' is listed among the choices so that the default level can also be requested explicitly
    parser.add_argument('-l', '--level', choices=['debug', 'info', 'warning'], default='info', help='Set log level. Allowed values are "debug", "info" or "warning". Default is "info"')
    parser.add_argument('--dbpo-url', help='Specify an alternative URL for the DBpedia ontology')
    parser.add_argument('--sandbox', action='store_true', help='Run the bot with test mappings over Wikidata sandbox items')
    return parser
def main(args):
    """
    Run the bot according to the parsed command line arguments.

    In sandbox mode, a fixed set of test mappings is added to Wikidata sandbox items;
    otherwise the mappings are extracted from the DBpedia ontology.

    :param args: parsed arguments providing the level, sandbox and dbpo_url attributes
    :return: always 0
    """
    logger = setup_logger(args.level)
    date_stamp = today()
    if args.sandbox:
        logger.info("Running with test mappings over Wikidata sandbox items...")
        sandbox_mappings = {
            'classes': [{'Q13406268': 'Espresso'}, {'Q13406268': 'Macchiato'}, {'Q4115189': 'Beer'}],
            'properties': [{'Q4115189': 'Cheese'}, {'Q4115189': 'Wine'}, {'Q13406268': 'Corretto'}],
        }
        add_all_claims(sandbox_mappings, date_stamp, logger)
        return 0
    # Only override the default ontology URL when one was given on the command line
    if args.dbpo_url:
        ontology = get_dbpedia_ontology(logger, url=args.dbpo_url)
    else:
        ontology = get_dbpedia_ontology(logger)
    mappings = extract_wikidata_mappings(ontology, logger)
    class_count = len(mappings['classes'])
    property_count = len(mappings['properties'])
    logger.info('A maximum of [%d] class and [%d] property mappings will be added. Total: [%d] (it may be less depending on the Item type double checking).' % (class_count, property_count, class_count + property_count))
    add_all_claims(mappings, date_stamp, logger)
    return 0
if __name__ == '__main__':
    # Parse the command line, run the bot and hand its return code over to the shell
    exit(main(create_cli_parser().parse_args()))