# Source wiki page: User:Edoderoobot/Create wikidata items
#2019-09-28 0923
"""
todo
2) avoid brackets in the label; move the bracketed text to the description
3) get the birth date etc. from the infobox
6) start parameters (language, generator: newpages/category/etc.)
7) recognise sports seasons
8) recognise performing artist (P175)
9) show when no categories have suggestive properties
flowchart on https://drive.google.com/file/d/1xJdGvp4FrLLJj8e3-GILIkF5eZf8obkH/view?usp=sharing
"""
import pywikibot
from pywikibot import pagegenerators as pg
import datetime
import time
from pywikibot.data import api
import sys, getopt
import json
import requests
# --- Configuration -----------------------------------------------------------
# Language code of the wiki being processed; also used to build the
# '<language>wiki' sitelink key.
language = 'nl'
# Label languages tried, in this order, when an item has no label in
# `language`.  These are language codes, so Danish is 'da' and Swedish is
# 'sv' (the country codes 'dk'/'se' used before never matched Danish/Swedish).
checklabels = ['en', 'de', 'fr', 'nl', 'es', 'pt', 'da', 'sv', 'ru']
metaclass = 'Q19361238'
max_new_pages = 99525      # upper bound passed to the new-pages generator
hours = 24                 # only pages younger than this many hours are processed
allowed_namespaces = [0]
# Derived values that are replaced by another item before they are stored.
professionconvert = {'Q30185': 'Q82955', 'Q4657217': 'Q82955'}
countryconvert = {'Q55': 'Q29999'}
# Minimum match scores before a search hit is accepted as "the same subject":
throttleBIG = 9000         # several search hits
throttleONE = 90           # exactly one search hit
throttleDISAMB = 4499      # the page is a disambiguation page
person = 'Q5'
# Classes that are all stored as `person`.
p31convert = {
    'Q5': person,
    'Q19660746': person,
    'Q215627': person,
    'Q30905655': person,
}
# Topic items that may be used directly as the P31 value.
canbeP31 = [person, 'Q19660746', 'Q215627', 'Q30905655', 'Q16521']
# P31 values for which a P17 claim may be added.
canbeP17 = [
    'Q515', 'Q16970', 'Q847017', 'Q13406463', 'Q12280', 'Q123705',
    'Q4167410', 'Q15383322', 'Q618779', 'Q194356', 'Q1286517', 'Q23442',
    'Q39715', 'Q27020041', 'Q178844', 'Q860861', 'Q12039431',
]
# P31 values for which a P495 claim may be added.
canbeP495 = [
    'Q11424', 'Q17329259', 'Q1580166', 'Q5398426', 'Q215380', 'Q571',
    'Q3305213', 'Q5185279', 'Q15416', 'Q191067', 'Q7889', 'Q47461344',
    'Q134556', 'Q7725634', 'Q482994', 'Q20667187', 'Q506240', 'Q11032',
    'Q1144661', 'Q1983062', 'Q1785271', 'Q2743', 'Q4830453', 'Q8436',
    'Q41298', 'Q7302866', 'Q3240003', 'Q2831984',
]
# Values that are never written as a claim target.
neglect = ['Q3957', 'Q3624078']
sexe = {'Q6581097': 'male', 'Q6581072': 'female'}
# Languages that receive the page title as label in one multi-language edit.
# NOTE(review): 'se' and 'no' look like they were meant as Swedish ('sv') and
# Norwegian Bokmal ('nb'); left unchanged because this list drives writes.
labellng = ['nl', 'de', 'fr', 'en', 'pt', 'es', 'it', 'da', 'se', 'pl', 'hr',
            'cs', 'sk', 'eo', 'hu', 'fy', 'nn', 'no']
# P31 values for which the multi-language label update is allowed.
updatelabelfor = [person, 'Q4830453', 'Q6881511']
sportset = ['Q31629', 'Q349']
months = ['Q108', 'Q109', 'Q110', 'Q118', 'Q119', 'Q120',
          'Q121', 'Q122', 'Q123', 'Q124', 'Q125', 'Q126']
birthdate_infobox = {'nl': 'geboortedatum', 'en': 'birth_date', 'de': 'Geburtsdatum'}
monthnamecache = []
commit = True              # False: claims are prepared but not saved
printcat = False           # True: trace the category walk
maxdeepcat = 5             # the category walk stops at this level
class Property:
    """A suggested statement value together with where it came from.

    Attributes:
        P: the suggested value, or None while nothing has been found.
        cat: the category item the value was derived from.
        level: the category level at which the value was found; the default
            9999 stands for "not found yet", so any real level is lower.
    """

    def __init__(self, P=None, cat=None, level=9999):
        self.P = P
        self.cat = cat
        self.level = level

    def __repr__(self):
        return f'Property(P={self.P!r}, cat={self.cat!r}, level={self.level!r})'
class wdSuggest:
    """Find or create the Wikidata item of one wiki page and add statements.

    Statement values are derived from the Wikidata items linked to the
    page's categories (``aggregateProperties``).  When the page has no item
    yet, Wikidata is searched by title (``findExisting``), the hits are
    scored (``Compare``) and the page is attached to the best hit or to a
    newly created item.

    The module-level configuration (``language``, ``repo``, the ``canbe*``
    lists, the thresholds, ...) is read as globals.
    """

    # Attribute that holds the suggested value for each property that
    # compareProperty can score against a candidate item.
    _SUGGESTION_FOR = {
        'P31': 'isa',
        'P641': 'sport',
        'P102': 'memberofpolitics',
        'P21': 'sexe',
        'P17': 'country',
        'P27': 'country',
        'P106': 'profession',
    }

    def __init__(self):
        self._reset()

    def _reset(self):
        """Put the instance back into its freshly constructed state."""
        self.wd = None                      # ItemPage being edited, or None
        self.pagename = None                # title of the page being processed
        self.isa = Property()               # written as P31
        self.sexe = Property()              # written as P21
        self.profession = Property()        # written as P106
        self.adminEntity = Property()       # written as P131
        self.sport = Property()             # written as P641
        self.country = Property()           # written as P27, P17 or P495
        self.memberofpolitics = Property()  # written as P102
        self.tijdstip = Property()          # written as P585
        self.performer = Property()         # not used yet
        self.searchresult = None            # list of candidate item ids, or None
        self.value = 0                      # score of the best candidate so far
        self.prospect = None                # id of the best candidate so far
        self.topcat = None

    def suggestPage(self, page, create=True, addproperties=True):
        """Process one wiki page.

        Args:
            page: the wiki page to process; redirects are skipped.
            create: create a new item when the title search finds nothing.
            addproperties: add the derived statements to the item.
        """
        lead = page.text[:15].lower()
        if lead.startswith(('#redirect', '#doorverwijzing')):
            return
        title = page.title()
        print(f'Page: {title}')
        # Start from a clean state; unlike rebinding `self` this keeps the
        # results visible on the instance the caller holds.
        self._reset()
        self.pagename = title
        self.aggregateProperties(page)

        # Only the lookup is guarded: a failure while adding statements must
        # not be mistaken for "this page has no item yet".
        try:
            item = pywikibot.ItemPage.fromPage(page)
            item.get(get_redirect=True)
        except Exception:
            item = None
        self.wd = item

        if item is not None:
            self.printitem(extratxt='already has wd-item')
            if addproperties:
                try:
                    added = self.addProperties()
                except Exception as err:  # best effort: keep the run going
                    print(f'Could not add properties: {err}')
                else:
                    if added > 0:
                        print('properties added')
            return

        self.printitem(extratxt='no wd-item yet')
        if 'disambiguation' in page.properties():
            print('Search 4 disambiguation')
            self.findExisting(title)
            if self.isa.P is None:
                self.isa.P = 'Q4167410'
                self.isa.cat = None
                self.isa.level = 9999
        else:
            self.findExisting(title)

        if self.searchresult is None:
            if create:
                self.newItem(title)
        else:
            self.evaluateResults(addproperties=addproperties)

    def allpropertiesfilled(self):
        """Return True when the category walk has nothing left to find."""
        if self.topcat:
            if printcat:
                print('All the way!')
            return True
        wanted = (self.isa, self.sexe, self.profession, self.adminEntity,
                  self.memberofpolitics, self.sport, self.country)
        if all(slot.P for slot in wanted):
            if printcat:
                print('Fully filled')
            return True
        return False

    def walksubcats(self, cat, level=1, usedcats=None):
        """Derive values from `cat` and, recursively, from the categories it is in.

        Args:
            cat: the category to start from.
            level: level of `cat`; the walk stops at ``maxdeepcat``.
            usedcats: list of category titles already handled; it is
                extended in place so the caller can inspect it afterwards.
        """
        if usedcats is None:  # never share one default list between calls
            usedcats = []

        def checkcat(title):
            # Accept titles with exactly one colon (the namespace prefix).
            start = title.find(':')
            if start > 0:
                return title[start + 1:].find(':') < 0
            return False

        if not (checkcat(cat.title()) and level < maxdeepcat):
            return
        self.findfromcat(cat, True, level)
        usedcats.append(cat.title())
        for parent in cat.categories():
            parent_title = parent.title()
            if printcat:
                print(f'subc: {parent_title}*-*level={level}')
            if parent_title not in usedcats and checkcat(parent_title):
                self.walksubcats(parent, level=level + 1, usedcats=usedcats)

    def aggregateProperties(self, page):
        """Fill the suggestion slots from the categories of `page`."""
        apcats = []
        if printcat:
            print('xxx')
        # Materialised once; the categories are needed for both passes.
        cats = list(page.categories())
        for cat in cats:  # pass 1: the page's own categories, level 0
            if printcat:
                print(f'page: {cat.title()}')
            self.findfromcat(cat, False, 0)
        if printcat:
            print('Not found yet')
        for cat in cats:  # pass 2: walk upwards from each category
            if printcat:
                print(f'redo: {cat.title()}')
            self.walksubcats(cat, level=1, usedcats=apcats)
            if self.allpropertiesfilled():
                if printcat:
                    print('Fullfilled')
                break
        if printcat:
            print(f'Used cats: {apcats}')

    def compareProperty(self, wdx, P, factor):
        """Return a score multiplier for property `P` of candidate `wdx`.

        Returns 1.0 when the candidate has no `P` claim, ``5*factor`` (for
        P21: ``2*factor``) on a match and ``0.1*factor`` when nothing
        matches.  Claims without a value are ignored.
        """
        if P not in wdx.claims:
            return 1.0
        if self.wd is not None:
            # Compare against the claims of the item already in use.
            if P in self.wd.claims:
                for own in self.wd.claims[P]:
                    for other in wdx.claims[P]:
                        if own.getTarget() == other.getTarget():
                            return 5 * factor  # full hit
                return factor  # both have the property, values differ
            return 0.1 * factor  # unlikely

        # No item yet: compare against the value suggested by the categories.
        slot = getattr(self, self._SUGGESTION_FOR.get(P, ''), None)
        expected = slot.P if slot is not None else None
        for claim in wdx.claims[P]:
            target = claim.getTarget()
            if target is None:  # "no value" / "unknown value" claim
                continue
            matches = expected == target.title()
            if P == 'P21':
                # Decided by the first claim: a match counts double, a
                # mismatch is neutral.
                return 2 * factor if matches else factor
            if matches:
                return 5 * factor
        return 0.1 * factor

    def Compare(self, wdx):
        """Return the overall match score of candidate item `wdx`."""
        uselabel = ''
        if language in wdx.labels:
            uselabel = wdx.labels[language]
        else:
            for tlang in checklabels:
                if tlang in wdx.labels:
                    uselabel = wdx.labels[tlang]
                    break  # checklabels is a preference order: first hit wins
        cval = 100 if uselabel == self.pagename else 1
        for prop, factor in (('P31', 9.0), ('P106', 5.0), ('P27', 4.0),
                             ('P102', 2.0), ('P641', 2.0), ('P21', 2.0)):
            cval = cval * self.compareProperty(wdx, prop, factor)
        return cval

    def evaluateResults(self, addproperties=True):
        """Score the search hits and attach the page to the best convincing one."""
        candidates = self.searchresult or []
        single = len(candidates) == 1
        disamb = self.isa.P == 'Q4167410'
        for xresult in candidates:
            wdx = pywikibot.ItemPage(repo, xresult)
            wdx.get(get_redirect=True)
            value = self.Compare(wdx)
            print(f'--{value}-{xresult}')
            convincing = (value > (throttleONE if single else throttleBIG)
                          or (disamb and value > throttleDISAMB))
            # A hit only replaces the current prospect when it scores higher.
            if convincing and value > self.value:
                self.value = value
                self.prospect = xresult
        if self.prospect is not None:
            print(f'Use {self.prospect}')
            self.attachPage(self.pagename)
            if addproperties:
                self.addProperties()

    def setLabels(self, pagename):
        """Add the page title (without a trailing "(...)" part) as label.

        Existing labels are never overwritten.
        """
        label = pagename
        pos = label.find('(')
        if pos > 2:
            label = label[:pos].rstrip()
        if language not in labellng or self.isa.P not in updatelabelfor:
            # Language or kind of item not suited for the multi-language
            # update: only the label in the page's own language is set.
            if language not in self.wd.labels:
                self.wd.editLabels({language: label})
            return
        missing = {lng: label for lng in labellng if lng not in self.wd.labels}
        if missing:
            self.wd.editEntity({'labels': missing}, summary='set multiple labels')

    def attachPage(self, pagename):
        """Set labels and the sitelink for `pagename` on the item in use.

        When no item is in use yet, the prospect chosen by evaluateResults
        is loaded first.
        """
        if self.wd is None:
            self.wd = pywikibot.ItemPage(repo, self.prospect)
            self.wd.get(get_redirect=True)
        self.setLabels(pagename)
        sitekey = language + 'wiki'
        if sitekey not in self.wd.sitelinks:
            self.wd.setSitelink(sitelink={'site': sitekey, 'title': pagename},
                                summary='set link')

    def hasPropertiesFilled(self):
        """Return True when at least one value was derived from the categories."""
        slots = (self.isa, self.sexe, self.profession, self.country,
                 self.memberofpolitics, self.adminEntity, self.sport,
                 self.tijdstip)
        return any(slot.P is not None for slot in slots)

    def printitem(self, extratxt=''):
        """Print the derived values; `extratxt` is accepted but not printed."""
        if self.isa.P is not None:
            print(f'Is een..: {self.isa.P}')
        if self.sexe.P is not None:
            print(f'Sexe....: {sexe[self.sexe.P]}')
        if self.profession.P is not None:
            print(f'Beroep..: {self.profession.P}')
        if self.sport.P is not None:
            print(f'Sport...: {self.sport.P}')
        if self.adminEntity.P is not None:
            print(f'Gemeente: {self.adminEntity.P}')
        if self.country.P is not None:
            print(f'Land....: {self.country.P}')
        if self.memberofpolitics.P is not None:
            print(f'Partij..: {self.memberofpolitics.P}')
        if self.searchresult is not None:
            print(f'Kan.....: {self.searchresult}')

    @staticmethod
    def _improves(slot, catlevel):
        """True when `slot` is still empty or was filled from a deeper level."""
        return slot.P is None or catlevel < slot.level

    def checkTopic(self, wdt, prop, target, sourcecat, catlevel):
        """Fill suggestion slots from one topic item of a category.

        Args:
            wdt: the topic item.
            prop: the claim of `wdt` to classify it by ('P31' or 'P279').
            target: the id of `wdt`.
            sourcecat: id of the category item the topic came from.
            catlevel: level of that category; lower levels take precedence.
        """
        if prop not in wdt.claims:
            return
        for isaclaim in wdt.claims[prop]:
            xt = isaclaim.getTarget()
            if not xt:
                continue
            isa = xt.title()
            if isa == 'Q6256' and self._improves(self.country, catlevel):
                if target in countryconvert and self.isPerson():
                    self.country.P = countryconvert[target]
                else:
                    self.country.P = target
                self.country.cat = sourcecat
                self.country.level = catlevel
            if ((target in canbeP31 or target in canbeP17 or target in canbeP495)
                    and self._improves(self.isa, catlevel)):
                self.isa.P = p31convert.get(target, target)
                self.isa.cat = sourcecat
                self.isa.level = catlevel
            if isa == 'Q515' and self._improves(self.adminEntity, catlevel):
                self.adminEntity.P = target
                self.adminEntity.cat = sourcecat
                self.adminEntity.level = catlevel
            if isa == 'Q28640' and self._improves(self.profession, catlevel):
                self.profession.P = professionconvert.get(target, target)
                self.profession.cat = sourcecat
                self.profession.level = catlevel
            if isa == 'Q7278' and self._improves(self.memberofpolitics, catlevel):
                self.memberofpolitics.P = target
                self.memberofpolitics.cat = sourcecat
                self.memberofpolitics.level = catlevel
            if isa == 'Q3186692' and self.tijdstip.P is None:
                try:
                    wbt = pywikibot.ItemPage(repo, target)
                    wbt.get(get_redirect=True)
                    if 'P585' in wbt.claims:
                        # claims[...] is a list of claims; use the first one.
                        self.tijdstip.P = wbt.claims['P585'][0].getTarget()
                except Exception as err:  # best effort: the date is optional
                    print(f'No point in time from {target}: {err}')
                self.tijdstip.cat = sourcecat
                self.tijdstip.level = catlevel
            if target in sexe and self.sexe.P is None:
                self.sexe.P = target
                self.sexe.cat = sourcecat
            if isa in sportset and catlevel < self.sport.level:
                self.sport.P = target
                self.sport.cat = sourcecat
                self.sport.level = catlevel

    def findExisting(self, search_for):
        """Search Wikidata for items named `search_for`.

        Stores the list of hit ids in ``self.searchresult``, or None when
        nothing was found or an item is already in use.
        """
        ssite = pywikibot.Site("wikidata", "wikidata")
        params = {
            'action': 'wbsearchentities',
            'format': 'json',
            'language': 'en',
            'type': 'item',
            'search': search_for,
        }
        response = api.Request(site=ssite, parameters=params).submit()
        # The hit id is the item title; evaluateResults loads each candidate
        # itself, so nothing is fetched here.
        result = [hit['id'] for hit in response['search']]
        if not result or self.wd is not None:
            self.searchresult = None
        else:
            self.searchresult = result

    def newItem(self, title):
        """Create a new item for `title` and set label, sitelink and claims.

        Nothing is created when there are search hits or when no value
        could be derived.  Returns the number of claims added.
        """
        if self.searchresult is not None or not self.hasPropertiesFilled():
            return 0
        print('Will create')
        created = repo.editEntity({}, {}, summary='#cwifna')
        self.wd = pywikibot.ItemPage(repo, created['entity']['id'])
        self.wd.get()
        self.attachPage(title)
        return self.addProperties()

    def isPerson(self):
        """Return True when the item in use, or the suggestion, is a person."""
        if self.wd is not None and 'P31' in self.wd.claims:
            for claim in self.wd.claims['P31']:
                if claim is None:
                    continue
                target = claim.getTarget()
                # Compare ids: an ItemPage never equals a plain string.
                if target is not None and target.title() == person:
                    return True
        return self.isa.P == person

    def addOneProperty(self, P, V, summary='add claim', onlyaddnew=True):
        """Add claim `P` with value `V` to the item in use.

        Args:
            P: property id.
            V: item id (str) or a ready-made target value such as a
                WbTime; None means "nothing to add".
            summary: edit summary.
            onlyaddnew: skip when the item already has a `P` claim.

        Returns:
            1 when a claim was prepared (and saved if ``commit``), else 0.
        """
        # P17 and P495 are only allowed for the kinds of item listed in
        # canbeP17 / canbeP495; report kinds that are in neither list.
        if P == 'P17' and self.isa.P not in canbeP17:
            if self.isa.P not in canbeP495 and self.isa.P:
                print(f'No P17 for {self.isa.P}')
            return 0
        if P == 'P495' and self.isa.P not in canbeP495:
            if self.isa.P not in canbeP17 and self.isa.P:
                print(f'No P495 for {self.isa.P}')
            return 0
        if V is None or V in neglect:
            return 0
        if onlyaddnew and P in self.wd.claims:
            return 0
        claim = pywikibot.Claim(repo, P)
        target = pywikibot.ItemPage(repo, V) if isinstance(V, str) else V
        claim.setTarget(target)
        if commit:
            self.wd.addClaim(claim, summary=summary)
        return 1

    def addProperties(self):
        """Add all derived values to the item in use.

        Returns the number of claims added (0 when no item is in use).  The
        derived values are cleared afterwards so a second call adds nothing.
        """
        if self.wd is None:
            return 0
        added = 0
        if self.isPerson():
            added += self.addOneProperty('P106', self.profession.P, summary=f'P106 from categorie {self.profession.cat}')
            if self.country.P in countryconvert:
                self.country.P = countryconvert[self.country.P]
            added += self.addOneProperty('P27', self.country.P, f'country from categorie {self.country.cat}')
            added += self.addOneProperty('P102', self.memberofpolitics.P, summary=f'add membership of political party from categorie {self.memberofpolitics.cat}')
            added += self.addOneProperty('P21', self.sexe.P, summary=f'P21 from categorie {self.sexe.cat}')
        else:
            added += self.addOneProperty('P131', self.adminEntity.P, summary=f'add location from category {self.adminEntity.cat}')
            added += self.addOneProperty('P17', self.country.P, f'country from categorie {self.country.cat}')
            added += self.addOneProperty('P495', self.country.P, f'country of origin from categorie {self.country.cat}')
            added += self.addOneProperty('P641', self.sport.P, summary=f'add sport from category {self.sport.cat}')
            added += self.addOneProperty('P585', self.tijdstip.P, summary=f'add point in time from {self.tijdstip.cat}')
        # P31 goes last: the P17/P495 checks above still need self.isa.P.
        added += self.addOneProperty('P31', self.isa.P, summary=f'P31 from categorie {self.isa.cat}')
        self.profession.P = None
        self.country.P = None
        self.adminEntity.P = None
        self.memberofpolitics.P = None
        self.tijdstip.P = None
        self.isa.P = None
        self.sexe.P = None
        return added

    def findfromcat(self, cat, showmissing, level):
        """Derive values from the topics (P301, P971) of category `cat`.

        Args:
            cat: the category page.
            showmissing: accepted for compatibility; not used.
            level: category level passed on to checkTopic.
        """
        if 'wikibase_item' not in cat.properties():
            return
        wdc = cat.data_item()
        wdc.get(get_redirect=True)
        for cl in ('P301', 'P971'):
            for topic in wdc.claims.get(cl, []):
                wdt = topic.getTarget()
                if wdt is None:
                    continue
                target = wdt.title()
                wdt.get(get_redirect=True)
                self.checkTopic(wdt, 'P31', target, wdc.title(), level)
                self.checkTopic(wdt, 'P279', target, wdc.title(), level)
def getnewpages(site):
    """Yield existing pages of `site` created less than `hours` hours ago.

    Only pages in `allowed_namespaces` are yielded.  Iteration stops at the
    first existing page that is older than the time window.
    """
    max_age = datetime.timedelta(hours=hours)
    for page in pg.NewpagesPageGenerator(site, 0, max_new_pages):
        if not page.exists():  # avoid speedy deleted pages
            continue
        created = page.oldest_revision.timestamp
        # Revision timestamps are UTC, so the age must be measured against
        # the current UTC time, not against local time (today()).
        if created.utcnow() - created >= max_age:
            break
        if page.namespace().id in allowed_namespaces:
            yield page
def check_cat(thiscat):
    """Yield the namespace-0 pages of category `thiscat` on the global `site`.

    The category generator is called with 99 as its second argument.
    """
    category = pywikibot.Category(site, thiscat)
    members = pg.CategorizedPageGenerator(
        category, 99, start=None, total=None, namespaces=None)
    yield from (member for member in members if member.namespace() == 0)
def getnewpageswithoutwd(site):
    """Yield the pages from getnewpages(site) that have no Wikidata item yet.

    getnewpages already restricts its output to existing pages in
    `allowed_namespaces` that are younger than `hours` hours, so those
    checks are not repeated here.
    """
    for page in getnewpages(site):
        if 'wikibase_item' not in page.properties():
            yield page
def all_links_from(page):
    """Run wdSuggest on the local-wiki page of every item linked from `page`.

    The title of each existing linked page is used as an item id.  Items
    without a sitelink for '<language>wiki' are skipped.
    """
    for linked in page.linkedPages():
        if not linked.exists():
            continue
        print(linked.title())
        wds = wdSuggest()
        wikidata_repo = pywikibot.Site('wikidata', 'wikidata').data_repository()
        wds.wd = pywikibot.ItemPage(wikidata_repo, linked.title())
        wds.wd.get(get_redirect=True)
        if language + 'wiki' not in wds.wd.sitelinks:
            continue
        local_page = pywikibot.Page(site, wds.wd.sitelinks[language + 'wiki'])
        wds.suggestPage(local_page)
# Default wiki and its data repository. Functions and methods in this file
# read `site` and `repo` as module-level globals, so they must be defined
# before any of those are called.
site=pywikibot.Site(language,'wikipedia')
repo=site.data_repository()
def PagePyleGenerator(pile):
    """Yield the pages of PagePile number `pile`.

    Yields loaded ItemPage objects when the pile's language is 'wikidata',
    otherwise Page objects on the pile's own site.

    Raises:
        requests.RequestException: the request failed, timed out or
            returned an HTTP error status.
    """
    api_token = '?id=%s&action=get_data&format=json&doit'
    api_url_base = 'https://tools.wmflabs.org/pagepile/api.php'
    url = api_url_base + api_token % pile
    # A timeout keeps the bot from hanging on an unresponsive server.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    pile_data = json.loads(response.text)  # parsed once, reused below
    pile_lng = pile_data['language']
    pile_prj = pile_data['project']
    plsite = pywikibot.Site(pile_lng, pile_prj)
    plrepo = plsite.data_repository()
    for oneitem in pile_data['pages']:
        if pile_lng == 'wikidata':
            plwd = pywikibot.ItemPage(plrepo, oneitem)
            plwd.get(get_redirect=True)
            yield plwd
        else:
            yield pywikibot.Page(plsite, oneitem)
def pageIDgenerator():
    """Yield the page ids from the hand-maintained list below (empty for now)."""
    pageids = [
    ]
    yield from pageids
#""" #one category
# Active run: feed every page of the selected generator to wdSuggest.
# The disabled alternatives are kept as comments / string blocks below.
language = 'nl'
site = pywikibot.Site(language, 'wikipedia')
#gen=check_cat('Bedrijf naar land')
#gen=PagePyleGenerator('26341') #en-wiki disambiguation
#gen=getnewpages(site)
gen = getnewpageswithoutwd(site)
for page in gen:
    wds = wdSuggest()
    wds.suggestPage(page)
#""" #one category
""" #pageID
if (True):
site=pywikibot.Site('nl','wikipedia')
#print('OK')
for page in pg.PagesFromPageidGenerator(pageIDgenerator(),site=site):
#print(page)
wds=wdSuggest()
wds.suggestPage(page)
""" #pageID
"""#small handmade list
if (True):
site=pywikibot.Site('da','wikipedia')
for x in [
#'',
]:
wds=wdSuggest()
page=pywikibot.Page(site,x)
#print(f'Will suggest {x}')
wds.suggestPage(page)
""" #small handmade list
""" #one single page
site=pywikibot.Site('nl','wikipedia')
page=pywikibot.Page(site,'Grendel Games')
wds=wdSuggest()
wds.suggestPage(page)
""" #one single page
""" #all without claims
page=pywikibot.Page(pywikibot.Site('wikidata','wikidata'),'Wikidata:Database reports/without claims by site/nlwiki')
all_links_from(page)
""" #all without claims
def main(argv):
    """Parse the command-line arguments in `argv` and print the result.

    The parsed options are only echoed; an invalid option is reported on
    stdout instead of raising.
    """
    short_flags = 'gvhcl'
    long_flags = ['gen', 'verbose', 'help', 'cat', 'lang']
    try:
        parsed = getopt.getopt(argv, short_flags, long_flags)
    except getopt.GetoptError as err:
        print(f'xxx: {err}')
        return
    opts, args = parsed
    print(opts)
    print('-------')
    print(args)


if __name__ == "__main__":
    print('Start')
    main(sys.argv[1:])
    print('Klaar')