# User:Edoderoobot/Olympedia
# Jump to navigation
# Jump to search
import pywikibot
import sys
from urllib.request import urlopen
from bs4 import BeautifulSoup
from datetime import date
from pywikibot import pagegenerators as pg
#from datetime import datetime
from dateutil import parser
# Connection to Wikidata; ``repo`` is the repository every Claim and ItemPage
# in this file is created against.
site = pywikibot.Site('wikidata', 'wikidata')
repo = site.data_repository()
# Not referenced by any function visible in this file.
biodata = '<table class=\'biodata\'>'
# Properties that simpleway() fills from the "Measurements" row.
Pweight = 'P2067'
Plength = 'P2048'
# Character sets used by the measurement scanner in parseWeightLength().
numbers = '01234567890'
characters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
space = ' '
# Unit items passed to WbQuantity by newClaim(); parseWeightLength() picks
# them for the unit texts 'm', 'cm', 'kg' and 'g' respectively.
m = 'Q11573'
cm = 'Q174728'
kg = 'Q11570'
g = 'Q41803'
# Language code under which simpleway() stores aliases.
lng = 'nl'
# Maps the code taken from an Olympedia flag image path
# ('/images/flags/<CODE>.png', see addCountry()/listCountry()) to the Wikidata
# item id used as the P27 target.  listCountry() adds missing codes at run time
# and prints the whole dict so it can be pasted back here.
# NOTE(review): the '' -> '' entry yields an empty item id if it is ever
# looked up - confirm whether it is intentional.
countries = {'ALB': 'Q222', 'AND': 'Q228', 'ARG': 'Q414', 'ARM': 'Q399', 'AUS': 'Q408', 'GER': 'Q183', 'ETH': 'Q115', 'CUB': 'Q241', 'CAN': 'Q16', 'BLR': 'Q184', 'USA': 'Q30', '': '', 'ISL': 'Q189', 'JPN': 'Q17', 'SWE': 'Q34',
'GBR': 'Q145', 'KOR': 'Q884', 'NOR': 'Q20', 'BEL': 'Q31', 'HUN': 'Q28', 'EGY': 'Q79', 'NZL': 'Q664', 'ITA': 'Q38', 'MOZ': 'Q1029', 'CRO': 'Q224', 'DEN': 'Q35', 'IRQ': 'Q796', 'IRI': 'Q794', 'LTU': 'Q37', 'CZE': 'Q213',
'TCH': 'Q33946', 'RSA': 'Q258', 'ROU': 'Q218', 'RUS': 'Q159', 'SLO': 'Q215', 'CHN': 'Q148', 'FRA': 'Q142', 'FIN': 'Q33', 'BRA': 'Q155', 'FRG': 'Q183', 'SUI': 'Q39', 'POR': 'Q45', 'IRL': 'Q27', 'NED': 'Q29999',
'ISR': 'Q801', 'IND': 'Q668', 'GUA': 'Q774', 'MEX': 'Q96', 'MYA': 'Q836', 'KSA': 'Q851', 'CRC': 'Q800', 'SVK': 'Q214', 'POL': 'Q36', 'TUR': 'Q43', 'GEQ': 'Q983', 'NGR': 'Q1033', 'LAT': 'Q211', 'GEO': 'Q230',
'UKR': 'Q212', 'ESP': 'Q29', 'GAB': 'Q1000', 'PUR': 'Q30', 'AUT': 'Q40', 'URS': 'Q15180', 'PAK': 'Q843', 'BUL': 'Q219', 'COL': 'Q739', 'THA': 'Q869', 'GDR': 'Q16957', 'URU': 'Q77', 'VEN': 'Q717', 'JAM': 'Q766',
'LCA': 'Q760', 'SUD': 'Q1049', 'ZIM': 'Q954', 'EST': 'Q191', 'CHI': 'Q298', 'ALG': 'Q262', 'GRE': 'Q41', 'ANG': 'Q916', 'GUM': 'Q16635', 'LUX': 'Q32', 'BOL': 'Q750', 'KEN': 'Q114', 'MDV': 'Q826', 'MRI': 'Q1027',
'PER': 'Q419', 'HKG': 'Q148', 'VIE': 'Q881', 'ROC': 'Q159', 'PRK': 'Q423', 'MAR': 'Q1028', 'UGA': 'Q1036', 'MNE': 'Q236', 'GRN': 'Q769', 'BUR': 'Q965', 'MAS': 'Q833', 'HON': 'Q783', 'BAH': 'Q778', 'BRN': 'Q398',
'PLE': 'Q219060', 'MDA': 'Q217', 'ESA': 'Q792', 'FIJ': 'Q712', 'LBR': 'Q1014', 'LBN': 'Q822', 'UZB': 'Q265', 'KGZ': 'Q813', 'KAZ': 'Q232', 'GHA': 'Q117', 'SRB': 'Q403', 'SLE': 'Q1044', 'MLT': 'Q233', 'AZE': 'Q227',
'TPE': 'Q865', 'YUG': 'Q36704', 'BIH': 'Q225', 'VNM': 'Q881', 'BOH': 'Q39193', 'TJK': 'Q863', 'ATH': 'Q844930', 'LIE': 'Q347', 'SCG': 'Q37024', 'PHI': 'Q928', 'SRI': 'Q854', 'CIV': 'Q1008', 'NEP': 'Q837', 'ZAM': 'Q953',
'ARU': 'Q21203', 'LAO': 'Q819', 'UAE': 'Q878', 'GBS': 'Q1007', 'BAN': 'Q902', 'CAM': 'Q424', 'YEM': 'Q805', 'SEY': 'Q1042', 'SYR': 'Q858', 'SKN': 'Q763', 'IVB': 'Q145', 'RWA': 'Q1037', 'SEN': 'Q1041', 'QAT': 'Q846',
'BER': 'Q23635', 'CMR': 'Q1009', 'ISV': 'Q11703', 'SAM': 'Q683', 'CYP': 'Q229', 'TUN': 'Q948', 'KIR': 'Q710', 'EUN': 'Q159', 'MAW': 'Q1020', 'MLI': 'Q912', 'ECU': 'Q736', 'NAM': 'Q1030', 'IOA': 'Q574', 'SUR': 'Q730',
'ERI': 'Q986', 'MAD': 'Q1019', 'JOR': 'Q810', 'VAN': 'Q686', 'SAA': 'Q183', 'SPA': 'Q5690', 'MGL': 'Q711', 'TLS': 'Q574', 'RHO': 'Q217169', 'PAR': 'Q733', 'ANT': 'Q781', 'COK': 'Q26988', 'SGP': 'Q334', 'SMR': 'Q238',
'CAF': 'Q929', 'GAM': 'Q1005', 'KUW': 'Q817', 'ANZ': 'Q408', 'BDI': 'Q967', 'BOT': 'Q963', 'CGO': 'Q971', 'DOM': 'Q786', 'AFG': 'Q889', 'TTO': 'Q754', 'LBA': 'Q1016', 'NCA': 'Q811', 'HAI': 'Q790', 'PNG': 'Q691',
'TAN': 'Q924', 'TKM': 'Q874', 'GUY': 'Q734', 'BAR': 'Q244', 'FSM': 'Q702', 'KOS': 'Q1246', 'PLW': 'Q695', 'TOG': 'Q945', 'VIN': 'Q757', 'STP': 'Q45', 'ASA': 'Q30', 'BIZ': 'Q242', 'MON': 'Q235', 'INA': 'Q252',
'MAK': 'Q83958', 'SWZ': 'Q1050', 'PAN': 'Q804', 'BRU': 'Q921', 'COD': 'Q974', 'BEN': 'Q962', 'OMA': 'Q842', 'CHA': 'Q657', 'TUV': 'Q672', 'TGA': 'Q678', 'AHO': 'Q29999','LES': 'Q1013', 'NRU': 'Q697',
'GUI': 'Q1006', 'BHU': 'Q917', 'SOL': 'Q685', 'DMA': 'Q784', 'SOM': 'Q1045', 'DJI': 'Q977', 'MKD': 'Q221', 'CAY': 'Q145', 'UAR': 'Q79', 'CPV': 'Q1011', 'MHL': 'Q709', 'EOR': 'Q958', 'COM': 'Q970',
'NIG': 'Q1032', 'COR': 'Q884','EPH': 'Q1747689', 'MAL': 'Q833', 'ROM': 'Q1747689', 'YMD': 'Q199841', 'WIF': 'Q754', 'YAR': 'Q267584', 'MTN': 'Q1025'}
def wd_sparql_query(spq):
    """Yield the existing Wikidata items returned by the SPARQL query *spq*.

    Results for which ``exists()`` is false are skipped.  Every yielded item
    has been loaded with ``get(get_redirect=True)`` first.
    """
    query_site = pywikibot.Site('wikidata', 'wikidata')
    for item in pg.WikidataSPARQLPageGenerator(spq, site=query_site):
        if not item.exists():
            continue
        item.get(get_redirect=True)
        yield item
def parseWeightLength(string):
    """Parse an Olympedia "Measurements" text such as ``'180 cm / 75 kg'``.

    The text is read left to right as: length digits, optional spaces, a
    length unit, any run of spaces / slashes / dashes, weight digits, optional
    spaces, a weight unit.  A unit is whatever follows the digits up to the
    next space or slash.

    Args:
        string: the cell text; anything that is not a ``str`` (for example
            ``None`` from an empty cell) is treated as unparsable.

    Returns:
        ``(weight, weight_unit, length, length_unit)`` where the units are the
        module-level item ids ``g``/``kg`` and ``cm``/``m``.  A missing number
        next to a valid unit is reported as 0.  ``(0, '', 0, '')`` is returned
        when the length unit is not ``cm``/``m`` or the weight unit is not
        ``g``/``kg``; this includes texts that give only one of the two
        measurements.
    """
    import re  # local import: the file's import block does not provide ``re``

    failure = (0, '', 0, '')
    if not isinstance(string, str):
        # The previous implementation ended in ``return null, ...`` here,
        # which raised NameError instead of reporting "nothing parsed".
        return failure

    # Every group may be empty, so this pattern always matches at position 0.
    scanned = re.match(
        r'([0-9]*) *([^ /]*)[ /-]*([0-9]*) *([^ /]*)', string)
    lstr, l_unit, wstr, w_unit = scanned.groups()

    # Validate against the literal unit texts first so that unparsable input
    # is rejected before any unit item is looked up.
    if l_unit not in ('cm', 'm') or w_unit not in ('g', 'kg'):
        return failure

    rl_unit = cm if l_unit == 'cm' else m
    rw_unit = kg if w_unit == 'kg' else g
    return (int(wstr or '0'), rw_unit, int(lstr or '0'), rl_unit)
def makesrc():
    """Build the reference that is attached to every statement this bot adds.

    Returns:
        A P248 claim flagged as a reference, with item Q95606922 as target.
    """
    reference = pywikibot.Claim(repo, 'P248', is_reference=True)
    stated_in = pywikibot.ItemPage(repo, 'Q95606922')
    reference.setTarget(stated_in)
    return reference
def newClaim(wd, P, value, unit, summary):
    """Add a sourced quantity statement to the item *wd*.

    Args:
        wd: item that receives the statement.
        P: property id of the statement.
        value: amount of the quantity.
        unit: item id of the unit.
        summary: edit summary passed to ``addClaim``.
    """
    unit_item = pywikibot.ItemPage(repo, unit)
    quantity = pywikibot.WbQuantity(value, unit_item, site=site)
    statement = pywikibot.Claim(repo, P)
    statement.setTarget(quantity)
    statement.addSources([makesrc()])
    wd.addClaim(statement, summary=summary)
def simpleway(wd, table):
    """Copy facts from an Olympedia biodata table onto the Wikidata item *wd*.

    Each table row is handled according to the text of its own ``<th>`` cell:

    * ``Measurements``  - weight (P2067) and length (P2048), only when the
      item has no such claim yet and the value passes a plausibility floor
      (weight > 30, length > 90).
    * ``Full name``, ``Used name``, ``Other names``, ``Original name`` -
      stored as an alias in language ``lng``.
    * ``Nick/petnames`` - every comma separated entry is passed to
      ``addNickName``.
    * ``NOC``           - ``addCountry`` when the item has no P27, otherwise
      ``listCountry``.
    * ``Born`` / ``Died`` - P569 / P570 when absent.
    * ``Sex``           - P21 when absent.
    * ``Type`` and ``Affiliations`` are ignored; any other header is printed.

    Args:
        wd: the item to update.
        table: the parsed ``<table class="biodata">`` element, or ``None``
            when the page has none (then nothing is done).
    """
    if table is None:
        # Olympedia() forwards the result of ``soup.find`` unchanged, and that
        # is None for a page without a biodata table.
        print('no biodata table found, skipped')
        return

    alias_headers = ('Full name', 'Used name', 'Other names', 'Original name')

    for rowdata in table.findAll('tr'):
        # Take the header from the row itself.  Pairing rows with a separate
        # list of all <th> cells by index goes out of step (or raises
        # IndexError) as soon as one row has no header cell.
        header_cell = rowdata.find('th')
        if header_cell is None:
            continue
        header = header_cell.get_text()

        for cell in rowdata.findAll('td'):
            raw = cell.find(text=True)
            # An empty cell has no text node; use '' so the string methods
            # below keep working.
            text = '' if raw is None else raw

            if header == 'Measurements':
                weight, wunit, length, lunit = parseWeightLength(text)
                if Pweight not in wd.claims and weight > 30:
                    newClaim(wd, Pweight, weight, wunit,
                             'add weight from Olympedia')
                if Plength not in wd.claims and length > 90:
                    newClaim(wd, Plength, length, lunit,
                             'add length from Olympedia')
            elif header in ('Type', 'Affiliations'):
                pass
            elif header in alias_headers:
                alias = text.replace('•', ' ')
                if alias:
                    updateOneAlias(wd, lng, alias)
            elif header == 'Nick/petnames':
                for nick in text.split(','):
                    # "A, B" splits into 'A' and ' B'; drop the padding and
                    # skip empty pieces.
                    nick = nick.strip()
                    if nick:
                        addNickName(wd, nick)
            elif header == 'NOC':
                countryname = cell.find()
                if 'P27' not in wd.claims:
                    addCountry(wd, countryname)
                else:
                    listCountry(wd, countryname)
            elif header == 'Died':
                if 'P570' not in wd.claims:
                    DateClaim(wd, 'P570', raw)
            elif header == 'Born':
                if 'P569' not in wd.claims:
                    DateClaim(wd, 'P569', raw)
            elif header == 'Sex':
                if 'P21' not in wd.claims:
                    if text == 'Female':
                        addFemale(wd)
                    elif text == 'Male':
                        addMale(wd)
            else:
                print(header, '––', raw)
def addMale(wd):
    """Add a sourced P21 statement with target Q6581097 to the item *wd*."""
    print('addMale')
    statement = pywikibot.Claim(repo, 'P21')
    statement.setTarget(pywikibot.ItemPage(repo, 'Q6581097'))
    statement.addSources([makesrc()])
    wd.addClaim(statement, summary='add sexe from Olympedia')
def addFemale(wd):
    """Add a sourced P21 statement with target Q6581072 to the item *wd*."""
    print('addFemale')
    statement = pywikibot.Claim(repo, 'P21')
    statement.setTarget(pywikibot.ItemPage(repo, 'Q6581072'))
    statement.addSources([makesrc()])
    wd.addClaim(statement, summary='add sexe from Olympedia')
def addCountry(wd, countryname):
    """Add a sourced P27 statement to *wd* based on an Olympedia flag image.

    Args:
        wd: the item to update.
        countryname: the tag found in the NOC cell; its ``src`` attribute is
            expected to look like ``/images/flags/<CODE>.png``.  ``None`` or a
            tag without ``src`` is reported and ignored.

    The code is looked up in the module-level ``countries`` table; an unknown
    code (or one mapped to an empty id) is reported and nothing is written.
    """
    if countryname is None:
        print('no flag image in NOC cell, skipped')
        return
    src = countryname.get('src')
    if not src:
        print('no flag image in NOC cell, skipped')
        return

    cntr = src.replace('/images/flags/', '').replace('.png', '')
    print(cntr)

    country_id = countries.get(cntr)
    if not country_id:
        # The old message used '%1', which is not a valid format spec and
        # raised ValueError instead of printing.
        print('missing %s in countries' % cntr)
        return

    claim = pywikibot.Claim(repo, 'P27')
    claim.setTarget(pywikibot.ItemPage(repo, country_id))
    claim.addSources([makesrc()])
    wd.addClaim(claim, summary='add nationality from Olympedia')
def updateOneAlias(wd, lng, alias):
    """Append *alias* to the aliases of *wd* in language *lng*.

    Nothing is done when *alias* equals the label in that language or is
    already listed as an alias.  Otherwise the item is reloaded, the alias is
    appended to the existing list (or starts a new one) and the full list is
    written back with ``editEntity``.
    """
    # The label itself never needs to be repeated as an alias.
    if lng in wd.labels and wd.labels[lng] == alias:
        return

    wd.get(get_redirect=True)

    if lng not in wd.aliases:
        updated = [alias]
    else:
        # Same list object as the item holds; it is extended in place.
        updated = wd.aliases[lng]
        if alias in updated:
            return
        updated.append(alias)

    payload = {'aliases': {lng: updated}}
    wd.editEntity(payload, summary=f'---add from Olympedia alias for {lng}')
    print('Add alias %s' % alias)
def addNickName(wd, nickname):
    """Add *nickname* as a sourced P1449 statement unless it is already there.

    Args:
        wd: the item to update.
        nickname: the nickname text; surrounding whitespace is removed and an
            empty or ``None`` value is ignored.

    The statement is stored as monolingual text with language ``'en'``.
    """
    nickname = (nickname or '').strip()
    if not nickname:
        return

    # The original guard was inverted: it returned as soon as the item had
    # any P1449 claim and otherwise indexed the missing key.  Compare against
    # the existing values only when there are some.
    if 'P1449' in wd.claims:
        for existing in wd.claims['P1449']:
            current = existing.getTarget()
            if current is not None and current.text == nickname:
                return

    claim = pywikibot.Claim(repo, 'P1449')
    target = pywikibot.WbMonolingualText(text=nickname, language='en')
    claim.setTarget(target)
    claim.addSources([makesrc()])
    wd.addClaim(claim, summary='Nickname from Olympedia')
def Olympedia(wd):
    """Fetch the Olympedia page of the athlete *wd* and import its biodata.

    The athlete id is the target of the item's first P8286 claim.  The page
    ``https://www.olympedia.org/athletes/<id>`` is downloaded, its
    ``<table class="biodata">`` is located and handed to ``simpleway``.

    Items without a P8286 value, pages that cannot be downloaded or decoded,
    and pages without a biodata table are skipped.
    """
    from http.client import HTTPException  # local: not imported by the file

    try:
        athlete_id = wd.claims['P8286'][0].getTarget()
    except (KeyError, IndexError):
        return

    url = 'https://www.olympedia.org/athletes/%s' % athlete_id
    try:
        # ``with`` closes the HTTP response even when read/decode fails.
        with urlopen(url) as response:
            htmltext = response.read().decode('utf-8')
    except (OSError, ValueError, HTTPException) as err:
        # OSError covers URLError/HTTPError and socket errors; ValueError
        # covers malformed URLs and UnicodeDecodeError.
        print('could not fetch %s: %s' % (url, err))
        return

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(htmltext, 'html.parser')
    table = soup.find("table", attrs={"class": "biodata"})
    if table is None:
        print('no biodata table on %s' % url)
        return
    simpleway(wd, table)
def allOlympedians():
    """Run ``Olympedia`` on every item returned by the SPARQL query below.

    The query selects items that have both a P8286 and a P27 statement.
    """
    # Narrower query used earlier (adds wdt:P106 wd:Q11513337 and
    # wdt:P27 wd:Q29999):
    # 'select ?item where {?item wdt:P8286 ?o; wdt:P27 ?l. ?item wdt:P106 wd:Q11513337; wdt:P27 wd:Q29999} '
    query = 'select ?item where {?item wdt:P8286 ?o; wdt:P27 ?l}'
    for item in wd_sparql_query(query):
        print('wd: ', item.title())
        Olympedia(item)
def listCountry(wd, countryname):
    """Learn a missing country code from an item that already has one P27.

    When the flag code taken from *countryname* is not yet a key of the
    module-level ``countries`` table and *wd* has exactly one P27 claim, the
    code is mapped to that claim's target and the whole table is printed so
    it can be pasted back into the source.

    Args:
        wd: item whose existing P27 claim supplies the country.
        countryname: the tag found in the NOC cell (``src`` like
            ``/images/flags/<CODE>.png``); ``None`` or a tag without ``src``
            is ignored.
    """
    try:
        cntr = countryname.get('src').replace(
            '/images/flags/', '').replace('.png', '')
    except AttributeError:
        # countryname is None, or the tag has no ``src`` attribute.
        return

    if cntr in countries or 'P27' not in wd.claims:
        return
    nationality = wd.claims['P27']
    if len(nationality) != 1:
        return

    target = nationality[0].getTarget()
    if target is None:
        # A claim without a concrete target item has nothing to record.
        return

    countries.update({cntr: target.title()})
    print(countries)  # copy-paste into source code
def DateClaim(wd, P, DateStr):
    """Add a sourced date statement *P* to *wd* from an Olympedia date text.

    Args:
        wd: the item to update.
        P: property id of the date statement (the callers use P569 / P570).
        DateStr: text such as ``'15 August 1980 in '``; the literal ``' in '``
            is removed before parsing.  ``None``, empty or unparsable text is
            reported and skipped.

    Only the parts that are really present in the text are stored: a bare
    year gives a year-only ``WbTime``, month and year give a month-level one.
    Text without a year is skipped.
    """
    from datetime import datetime  # local: the file imports only ``date``

    if not isinstance(DateStr, str) or not DateStr.strip():
        print('unknown date format, skipped: ', DateStr)
        return

    text = DateStr.replace(' in ', '')
    try:
        print('Begin')
        # dateutil fills every field missing from the text from ``default``
        # (today when omitted), which used to turn "1980" into a full date
        # with an invented month and day.  Parsing against two defaults that
        # differ in every field shows which fields came from the text.
        first = parser.parse(text, default=datetime(2000, 1, 1))
        second = parser.parse(text, default=datetime(2001, 2, 2))
        print('OK-!')
    except (ValueError, OverflowError, TypeError):
        print('unknown date format, skipped: ', DateStr)
        return  # no valid date extracted

    if first.year != second.year:
        # The year itself was taken from the default, i.e. not in the text.
        print('unknown date format, skipped: ', DateStr)
        return

    month = first.month if first.month == second.month else None
    day = None
    if month is not None and first.day == second.day:
        day = first.day

    # WbTime derives its precision from which of month/day are given.
    target = pywikibot.WbTime(year=first.year, month=month, day=day)
    claim = pywikibot.Claim(repo, P)
    claim.setTarget(target)
    claim.addSources([makesrc()])
    wd.addClaim(claim, summary='date from Olympedia')
# Disabled manual checks: a single date parse and a run against one item.
#DateClaim('','P570','15 August 1980')
#item=pywikibot.ItemPage(repo,'Q31295074')
#Olympedia(item)
# Entry point: this call is unguarded, so it runs whenever the file is
# executed or imported and processes every item returned by the query in
# allOlympedians().
allOlympedians()