# Source: User:JVbot/wikipedia-sync.py (wiki page navigation chrome removed)
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This script watches a group of Wikipedia pages for Wikidata and Wikipedia issues
"""
#
# (C) John Vandenberg, 2014
#
# Distributed under the terms of the MIT license.
import pywikibot
from pywikibot import pagegenerators
import time
class WikiDataBase:

    """
    Monitors a closely related set of Wikipedia pages using Wikidata.
    Continually polls for changes to support frequently updating events (e.g. sports championships).
    """

    # Class-level defaults; __init__ rebinds the mutable ones per instance.
    item = None  # base item to monitor (an ItemPage)
    items = None  # all items processed, keyed by Q-id
    unlinked_pages = None  # pages with no Wikidata item, keyed by 'lang:Title'
    category_matrix = None  # {category Q-id: {lang: [member Q-ids]}}
    category_only = False  # when True, refresh() may skip the what-links-here scan
    first_pass_completed = False  # set by run() after the first complete pass

    def __init__(self, page, category_only = False):
        """
        Arguments:
        * page - ItemPage of the event to monitor
        * category_only - when True, refresh() only walks the linked pages
          and the category tree, skipping the what-links-here scan
        """
        self.item = page
        self.items = {}
        self.unlinked_pages = {}
        self.category_matrix = {}
        self.category_only = category_only

    def item_wikipedia_languages(self, item):
        """
        Return the sorted language codes of the item's Wikipedia sitelinks.

        Site dbnames such as 'enwiki' are stripped to 'en'; Commons is
        excluded and 'no' is reported as 'nb'.
        """
        # Python 2: dict.keys() returns a list, so the in-place sort works.
        sites = item.sitelinks.keys()
        sites.sort()
        wikipedias = []
        for site in sites:
            if site.endswith('wiki') and site != 'commonswiki':
                if site[0:-4] == 'no':
                    wikipedias.append('nb')
                else:
                    wikipedias.append(site[0:-4])
        return wikipedias

    def process_category_page(self, category_page, parents=True, recurse=True, articles=True):
        """
        Process one category page: register it in category_matrix and walk
        its subcategories, parent categories and member articles.

        Returns the category's ItemPage, or None when the category has no
        item or this language was already processed for it.
        """
        #pywikibot.output('processing (category) %s' % (category_page.title(forceInterwiki=True)) )
        item = self.process_page(category_page)
        if item:
            key = item.title()
            lang = category_page.site.lang
            if item.title() in self.category_matrix:
                # Already walked this category in this language: stop here.
                if category_page.site.lang in self.category_matrix[key]:
                    return
                else:
                    self.category_matrix[key][lang] = []
            else:
                self.category_matrix[key] = {lang: [] }
            # Admin categories (instance of Q15647814) are registered but
            # their contents are not walked.
            if 'P31' in item.claims and item.claims['P31'][0].getTarget().title() == 'Q15647814':
                return
        if recurse:
            for subcat in category_page.subcategories():
                subcat_item = self.process_category_page(subcat)
                # Record the subcategory as a member of this category.
                if item and subcat_item:
                    self.category_matrix[key][lang].append(subcat_item.title())
        if parents:
            for parent_cat in category_page.categories():
                self.process_page(parent_cat, cats=parents)
        if articles:
            # get all items for all pages in category
            for page in category_page.articles():
                page_item = self.process_page(page) #,cats=parents)
                if item and page_item:
                    self.category_matrix[key][lang].append(page_item.title())
        return item

    def process_page(self, page, cats=False):
        """
        Process a Wikipedia page: resolve and process its Wikidata item,
        recording pages without an item in unlinked_pages.

        When cats is True, the page's categories are processed as well.
        Returns the page's ItemPage, or None when the page is skipped or
        has no item.
        """
        # Hard-coded skip list of known-noisy 2014 Winter Paralympics
        # pages; presumably these produced unwanted reports -- TODO confirm.
        if page.site.lang == 'fr' and page.title().endswith("mars aux Jeux paralympiques d'hiver de 2014"):
            return
        elif (page.site.lang == 'no' or page.site.lang == 'nb') and ' under Paralympiske ' in page.title() and not ('2014' in page.title() or '2010' in page.title()):
            return
        elif page.title().startswith('Template:2014 Winter Paralympics wheelchair curling'):
            return
        #pywikibot.output('processing (page) %s' % (page.title(forceInterwiki=True)) )
        page_item = page.data_item()
        if not page_item.exists():
            # Key is the interwiki link text without the brackets,
            # e.g. 'fr:Titre'.
            key = page.title(asLink=True,forceInterwiki=True).replace('[[','').replace(']]','')
            if key not in self.unlinked_pages:
                pywikibot.output('%s does not exist in Wikidata' % key)
                self.unlinked_pages[key] = page
                if self.first_pass_completed:
                    # After the first pass, suggest matches immediately.
                    self.find_unlinked_matches(page)
            return
        page_item = self.process_item(page_item, page=page)
        if cats:
            for cat in page.categories():
                if not cat.exists():
                    key = cat.title(asLink=True,forceInterwiki=True).replace('[[','').replace(']]','')
                    if key not in self.unlinked_pages:
                        pywikibot.output('%s on %s doesnt exist at all' % (key,page.title()) )
                        self.unlinked_pages[key] = cat
                else:
                    self.process_page(cat, cats=False)
        return page_item

    # page must be one of the sitelinks; it can be any of them
    # it is used to determine the type of page in this item, which is presumed to be the same for all sitelinks
    def process_item(self, item, page=None):
        """
        Load an item, report label/claim/sitelink issues, and cache it in
        self.items. Returns the ItemPage, or None when it does not exist.
        """
        quiet = False
        if item.title() in self.items:
            # Already processed and cached.
            return self.items[item.title()]
        item.get()
        if not item.exists():
            return
        # Build a display label: prefer English, else any label (tagged
        # with its language), else a placeholder; always append the Q-id.
        if 'en' in item.labels:
            item_label = item.labels['en']
        elif len(item.labels):
            # Python 2: keys() is a list, so [0] indexing works.
            label_lang = item.labels.keys()[0]
            item_label = item.labels[label_lang] + u' (' + label_lang + u')'
        else:
            item_label = u'NO LABEL IN ANY LANGUAGE!!'
        item_label = item_label + u' (' + item.title() + u')'
        if 'en' not in item.labels:
            pywikibot.output('%s does not have an English label' % item_label)
        if 'P31' not in item.claims:
            if page and page.isCategory():
                pywikibot.output('%s should have an instance of: Q15647814 (admin) or Q4167836 (content)' % item_label )
            else:
                pywikibot.output('%s should have an instance of' % item_label )
        elif 'P31' in item.claims and item.claims['P31'][0].getTarget().title() == 'Q15647814':
            # Admin categories do not get the sitelink coverage report.
            quiet = True
        if not quiet:
            if len(item.sitelinks.keys()) == 1 and item.sitelinks.keys()[0] != 'enwiki':
                pywikibot.output('%s only exists on %s' % (item_label,item.sitelinks.keys()[0]) )
            elif 'enwiki' not in item.sitelinks.keys():
                pywikibot.output('%s exists on %d wikis but not enwiki' % (item_label,len(item.sitelinks.keys()) ) )
        self.items[item.title()] = item
        return item

    def process_item_pages(self, item, wikipedias = None):
        """
        Process every Wikipedia page the item links to, optionally
        restricted to the given language codes.
        """
        item.get()
        if not wikipedias:
            wikipedias = self.item_wikipedia_languages(item)
        for lang in wikipedias:
            # 'nb' pages are stored under the 'nowiki' sitelink.
            if lang == 'nb':
                pagename = item.sitelinks['nowiki']
            else:
                pagename = item.sitelinks[lang+'wiki']
            page = pywikibot.Page( pywikibot.Site(lang,'wikipedia') , pagename)
            self.process_page(page, cats=True)

    def process_category_item(self, category_item, wikipedias = None, recurse = True , parents = True, articles=True ):
        """
        Walk the category's page on every Wikipedia it is linked to.

        NOTE(review): recurse/parents/articles are accepted but not
        forwarded to process_category_page, which runs with its own
        defaults -- confirm whether that is intended.
        """
        category_item.get()
        if not wikipedias:
            wikipedias = self.item_wikipedia_languages(category_item)
        for lang in wikipedias:
            if lang == 'nb':
                pagename = category_item.sitelinks['nowiki']
            else:
                pagename = category_item.sitelinks[lang+'wiki']
            if not self.first_pass_completed:
                pywikibot.output('Finding pages on %s.wikipedia' % lang)
            # Site codes use '-' where dbnames use '_' (e.g. 'zh-min-nan').
            category_page = pywikibot.Category( pywikibot.Site(lang.replace('_','-'),'wikipedia') , pagename)
            self.process_category_page(category_page)

    def processWLH(self):
        """
        Process the main-namespace pages that link to the base item's
        page on each Wikipedia ('What Links Here').
        """
        wikipedias = self.item_wikipedia_languages(self.item)
        for lang in wikipedias:
            # NOTE(review): unlike process_item_pages, 'nb' is not mapped
            # back to the 'nowiki' sitelink here, so an 'nb' entry would
            # look up the non-existent 'nbwiki' key -- confirm.
            pagename = self.item.sitelinks[lang+'wiki']
            if not self.first_pass_completed:
                pywikibot.output('Finding linked pages on %s.wikipedia' % lang)
            main_page = pywikibot.Page( pywikibot.Site(lang,'wikipedia') , pagename)
            gen = pagegenerators.ReferringPageGenerator(main_page)
            gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces=[0])
            for page in gen:
                if self.process_page(page, cats=False):
                    pywikibot.output('%s is related but not in the category tree' % page.title())

    def find_unlinked_matches(self, page):
        """
        Search self.item.site for items matching the page title (with any
        parenthesised disambiguator stripped) and report up to ten
        candidate matches.
        """
        page_title = page.title()
        if ' (' in page_title:
            # Strip the disambiguator, e.g. 'Foo (skier)' -> 'Foo'.
            page_title = page_title.split(' (')[0]
        count = 0
        # total=11 so an eleventh hit reveals that more results exist.
        gen = pagegenerators.SearchPageGenerator(page_title, namespaces=[0], total=11, site=self.item.site)
        for item in gen:
            if count == 10:
                pywikibot.output(u'.. and more results exist')
                break
            count = count + 1
            item = pywikibot.ItemPage(self.item.site, item.title())
            item.get()
            # Display label: English if present, else any, else '<none>'.
            if 'en' in item.labels:
                item_label = item.labels['en']
            elif len(item.labels) == 0:
                item_label = '<none>'
            else:
                item_label = item.labels[item.labels.keys()[0]]
            pywikibot.output(u'%s might be %s (%s)' % (page.title(asLink=True,forceInterwiki=True).replace('[[','').replace(']]',''), item_label, item.title() ))

    def find_all_unlinked_matches(self):
        """Report possible Wikidata matches for every unlinked page."""
        pywikibot.output('---- Now looking for matches for all unlinked pages ----')
        # Python 2 dict iteration.
        for key, page in self.unlinked_pages.iteritems():
            self.find_unlinked_matches(page)
        pywikibot.output('---- Finished looking for matches for all unlinked pages ----')

    def find_missing_cats(self):
        """
        For each tracked category, compare per-language membership against
        the union of member items across languages, and report wikis where
        the category is missing pages or does not exist at all.
        """
        # this isnt the most efficient of algorithms.
        pywikibot.output('---- Now looking for categories to populate ----')
        for category_qid in self.category_matrix:
            category_data = self.category_matrix[category_qid]
            category_item = self.items[category_qid]
            category_langs = self.item_wikipedia_languages(category_item)
            # Union of member Q-ids across all processed languages.
            all_contents = []
            for lang in category_langs:
                if lang == 'no':
                    lang = 'nb'
                if lang not in category_data:
                    pywikibot.output(u'%s lang %s wasnt processed for articles, or category is empty' % (category_qid,lang))
                    continue
                all_contents += category_data[lang]
            all_contents = set(all_contents)
            # For each member item, record every language that has an
            # article for it: {lang: [member Q-ids]}.
            all_lang_data = {}
            for page_qid in all_contents:
                page_item = self.items[page_qid]
                for lang in self.item_wikipedia_languages(page_item):
                    if lang == 'no':
                        lang = 'nb'
                    if lang not in all_lang_data:
                        all_lang_data[lang] = [page_qid]
                    else:
                        all_lang_data[lang].append(page_qid)
            # Drop languages needing no report: existing categories that
            # are already complete, and potential new categories with only
            # one candidate member. Python 2: keys() is a list copy, so
            # deleting entries while looping over it is safe.
            for lang in all_lang_data.keys():
                if lang in category_data:
                    if len(category_data[lang]) == len(all_lang_data[lang]):
                        del all_lang_data[lang]
                else:
                    if len(all_lang_data[lang]) == 1:
                        #pywikibot.output(u'%s in new lang %s has only one possible item: %s; skipping' % (category_qid, lang, all_lang_data[lang][0]) )
                        del all_lang_data[lang]
            if not len(all_lang_data):
                continue
            # Pick a label language for the report header.
            label_lang = None
            if 'en' in category_item.labels:
                label_lang = 'en'
            elif len(category_item.labels):
                label_lang = sorted(category_item.labels.keys())[0]
            # NOTE(review): if the item has no labels at all, label_lang
            # stays None and the lookup below raises KeyError -- confirm.
            pywikibot.output(u'%s (%s) improvements:' % (category_qid, category_item.labels[label_lang]) )
            # Split into wikis lacking the category entirely ('new') and
            # wikis where it exists but is missing pages ('modified').
            new_wikipedias = set(all_lang_data.keys()) - set(category_data.keys())
            modified_wikipedias = set(all_lang_data.keys()) - new_wikipedias
            pywikibot.output(u' existing category %s additions on wikis %s:' % (category_qid, u','.join(modified_wikipedias) ) )
            for lang in modified_wikipedias:
                if lang == 'nb':
                    slang = 'nowiki'
                else:
                    slang = lang+'wiki'
                msg_prefix = u' %s:%s ' % (lang,category_item.sitelinks[slang])
                category_lang_missing = set(all_lang_data[lang]) - set(category_data[lang])
                pywikibot.output(msg_prefix + u'is missing the following pages: %s' % (','.join(category_lang_missing)))
            # Wikis that could host a new category and already have a
            # label for the item in that language.
            new_wikipedias_with_labels = new_wikipedias & (set(category_item.labels.keys()) - set(category_langs))
            if not new_wikipedias_with_labels:
                pywikibot.output(u' potential new categories for %s on wikis %s (none have labels):' % (category_qid, u','.join(new_wikipedias) ) )
            else:
                pywikibot.output(u' potential new categories for %s on wikis %s:' % (category_qid, u','.join(new_wikipedias) ) )
            # Report the labelled wikis first.
            new_wikipedias = list(new_wikipedias_with_labels) + list(set(new_wikipedias) - set(new_wikipedias_with_labels))
            for lang in new_wikipedias:
                msg_prefix = u' '
                if lang in category_item.labels:
                    msg_prefix += u'%s:%s ' % (lang,category_item.labels[lang])
                else:
                    msg_prefix += u'%s:?? ' % (lang)
                if lang == 'no':
                    lang = 'nb'
                pywikibot.output(msg_prefix + u'should be created with items: %s' % (','.join(all_lang_data[lang])))

    def refresh_unlinked(self):
        """
        Re-check every previously unlinked page and process (then forget)
        those that now have a Wikidata item.
        """
        found = []
        for key, page in self.unlinked_pages.iteritems():
            page_item = page.data_item()
            if page_item.exists():
                pywikibot.output(u'%s now exists in Wikidata as %s; processing..' % (key, page_item.title()) )
                found.append(key)
                self.process_page(page)
        # Delete after the loop; removing during iteritems() would break.
        for key in found:
            del self.unlinked_pages[key]

    def refresh(self):
        """
        Populates the database.

        Processes the base item, its linked pages, the associated category
        tree (the P910 target, or the item itself when it is a category)
        and, unless suppressed, the pages linking to it.
        """
        self.item.get()
        self.process_item(self.item)
        # Q4167836: the base item is itself a (content) category.
        is_category_item = 'P31' in self.item.claims and self.item.claims['P31'][0].getTarget().title() == 'Q4167836'
        category_item = None
        if 'P910' in self.item.claims:
            # The P910 target is used as the category tree to walk.
            category_item = self.item.claims['P910'][0].getTarget()
        elif is_category_item:
            category_item = self.item
        if self.first_pass_completed:
            self.refresh_unlinked()
        if not is_category_item:
            pywikibot.output('---- Processing the pages linked to this Q ----')
            self.process_item_pages(self.item)
        if category_item:
            pywikibot.output('---- Now looking at category (%s) on all wikipedias ----' % category_item.title())
            self.process_category_item(category_item)
        if not category_item or (not is_category_item and not self.category_only):
            pywikibot.output('---- Now looking at what links here ----')
            self.processWLH()

    def run(self,sleep=None):
        """
        Refresh in an endless loop. After the first pass, report
        unlinked-page matches and missing categories once.

        Arguments:
        * sleep - minutes to pause between passes (no pause when falsy)
        """
        while True:
            self.refresh()
            if not self.first_pass_completed:
                self.find_all_unlinked_matches()
                self.find_missing_cats()
                self.first_pass_completed = True
            if sleep:
                pywikibot.output(u"Sleeping for %d minutes ..." % sleep )
                time.sleep(sleep*60)
def main():
    """
    Command line entry point.

    Options:
        -qid:N       numeric Wikidata item id (without the 'Q'); required
        -once        run a single refresh instead of polling forever
        -structure   do not process articles (structure only)
        -parents     also process parent categories
    plus any standard pywikibot page generator arguments.

    Raises:
        Exception: for an unknown option or a missing -qid.
    """
    args = pywikibot.handleArgs()
    qid = None
    site = pywikibot.getSite()
    datasite = site.data_repository()
    poll = True
    articles = True
    parents = False
    gen = pagegenerators.GeneratorFactory()
    for arg in args:
        # Handle page generator args
        if gen.handleArg(arg):
            continue
        elif arg.startswith('-qid:'):
            qid = int(arg[len('-qid:'):])
        elif arg == '-once':
            poll = False
        elif arg == '-structure':
            articles = False
        elif arg == '-parents':
            # Fixed: this previously set articles = True (already the
            # default), so -parents never enabled the parents flag.
            parents = True
        else:
            raise Exception('Unknown command line option')
    if not qid:
        # (an unreachable 'return' after this raise was removed)
        raise Exception('Need a -qid parameter')
    # NOTE(review): articles/parents are parsed but not passed on to the
    # WikiDataBase calls below -- confirm intended wiring.
    item = pywikibot.ItemPage(datasite, 'Q'+str(qid))
    item.get()
    generator = gen.getCombinedGenerator()
    db = WikiDataBase(item,category_only=True)
    if generator:
        # An explicit page generator was supplied: process only those pages.
        for page in generator:
            db.process_page(page, cats=True)
    else:
        try:
            if not poll:
                # Single pass.
                db.refresh()
                db.find_all_unlinked_matches()
                db.find_missing_cats()
            else:
                # Poll forever, sleeping one minute between passes.
                db.run(1)
        except KeyboardInterrupt:
            pass
# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()