# User:Edoderoobot/Olympedia — bot source, originally published on Wikidata.
# (Wiki page navigation chrome removed so this file is valid Python.)
import re
import sys
from datetime import date
from urllib.request import urlopen

import pywikibot
from bs4 import BeautifulSoup
from dateutil import parser
from pywikibot import pagegenerators as pg

#from datetime import datetime

site = pywikibot.Site('wikidata', 'wikidata')
repo = site.data_repository()
biodata = '<table class=\'biodata\'>'

Pweight = 'P2067'
Plength = 'P2048'
numbers = '01234567890'
characters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
space = ' '
m = 'Q11573'
cm = 'Q174728'
kg = 'Q11570'
g = 'Q41803'
lng = 'nl'

countries = {'ALB': 'Q222', 'AND': 'Q228', 'ARG': 'Q414', 'ARM': 'Q399', 'AUS': 'Q408', 'GER': 'Q183', 'ETH': 'Q115', 'CUB': 'Q241', 'CAN': 'Q16', 'BLR': 'Q184', 'USA': 'Q30', '': '', 'ISL': 'Q189', 'JPN': 'Q17', 'SWE': 'Q34',
             'GBR': 'Q145', 'KOR': 'Q884', 'NOR': 'Q20', 'BEL': 'Q31', 'HUN': 'Q28', 'EGY': 'Q79', 'NZL': 'Q664', 'ITA': 'Q38', 'MOZ': 'Q1029', 'CRO': 'Q224', 'DEN': 'Q35', 'IRQ': 'Q796', 'IRI': 'Q794', 'LTU': 'Q37', 'CZE': 'Q213',
             'TCH': 'Q33946', 'RSA': 'Q258', 'ROU': 'Q218', 'RUS': 'Q159', 'SLO': 'Q215', 'CHN': 'Q148', 'FRA': 'Q142', 'FIN': 'Q33', 'BRA': 'Q155', 'FRG': 'Q183', 'SUI': 'Q39', 'POR': 'Q45', 'IRL': 'Q27', 'NED': 'Q29999',
             'ISR': 'Q801', 'IND': 'Q668', 'GUA': 'Q774', 'MEX': 'Q96', 'MYA': 'Q836', 'KSA': 'Q851', 'CRC': 'Q800', 'SVK': 'Q214', 'POL': 'Q36', 'TUR': 'Q43', 'GEQ': 'Q983', 'NGR': 'Q1033', 'LAT': 'Q211', 'GEO': 'Q230',
             'UKR': 'Q212', 'ESP': 'Q29', 'GAB': 'Q1000', 'PUR': 'Q30', 'AUT': 'Q40', 'URS': 'Q15180', 'PAK': 'Q843', 'BUL': 'Q219', 'COL': 'Q739', 'THA': 'Q869', 'GDR': 'Q16957', 'URU': 'Q77', 'VEN': 'Q717', 'JAM': 'Q766',
             'LCA': 'Q760', 'SUD': 'Q1049', 'ZIM': 'Q954', 'EST': 'Q191', 'CHI': 'Q298', 'ALG': 'Q262', 'GRE': 'Q41', 'ANG': 'Q916', 'GUM': 'Q16635', 'LUX': 'Q32', 'BOL': 'Q750', 'KEN': 'Q114', 'MDV': 'Q826', 'MRI': 'Q1027',
             'PER': 'Q419', 'HKG': 'Q148', 'VIE': 'Q881', 'ROC': 'Q159', 'PRK': 'Q423', 'MAR': 'Q1028', 'UGA': 'Q1036', 'MNE': 'Q236', 'GRN': 'Q769', 'BUR': 'Q965', 'MAS': 'Q833', 'HON': 'Q783', 'BAH': 'Q778', 'BRN': 'Q398',
             'PLE': 'Q219060', 'MDA': 'Q217', 'ESA': 'Q792', 'FIJ': 'Q712', 'LBR': 'Q1014', 'LBN': 'Q822', 'UZB': 'Q265', 'KGZ': 'Q813', 'KAZ': 'Q232', 'GHA': 'Q117', 'SRB': 'Q403', 'SLE': 'Q1044', 'MLT': 'Q233', 'AZE': 'Q227',
             'TPE': 'Q865', 'YUG': 'Q36704', 'BIH': 'Q225', 'VNM': 'Q881', 'BOH': 'Q39193', 'TJK': 'Q863', 'ATH': 'Q844930', 'LIE': 'Q347', 'SCG': 'Q37024', 'PHI': 'Q928', 'SRI': 'Q854', 'CIV': 'Q1008', 'NEP': 'Q837', 'ZAM': 'Q953',
             'ARU': 'Q21203', 'LAO': 'Q819', 'UAE': 'Q878', 'GBS': 'Q1007', 'BAN': 'Q902', 'CAM': 'Q424', 'YEM': 'Q805', 'SEY': 'Q1042', 'SYR': 'Q858', 'SKN': 'Q763', 'IVB': 'Q145', 'RWA': 'Q1037', 'SEN': 'Q1041', 'QAT': 'Q846',
             'BER': 'Q23635', 'CMR': 'Q1009', 'ISV': 'Q11703', 'SAM': 'Q683', 'CYP': 'Q229', 'TUN': 'Q948', 'KIR': 'Q710', 'EUN': 'Q159', 'MAW': 'Q1020', 'MLI': 'Q912', 'ECU': 'Q736', 'NAM': 'Q1030', 'IOA': 'Q574', 'SUR': 'Q730',
             'ERI': 'Q986', 'MAD': 'Q1019', 'JOR': 'Q810', 'VAN': 'Q686', 'SAA': 'Q183', 'SPA': 'Q5690', 'MGL': 'Q711', 'TLS': 'Q574', 'RHO': 'Q217169', 'PAR': 'Q733', 'ANT': 'Q781', 'COK': 'Q26988', 'SGP': 'Q334', 'SMR': 'Q238',
             'CAF': 'Q929', 'GAM': 'Q1005', 'KUW': 'Q817', 'ANZ': 'Q408', 'BDI': 'Q967', 'BOT': 'Q963', 'CGO': 'Q971', 'DOM': 'Q786', 'AFG': 'Q889', 'TTO': 'Q754', 'LBA': 'Q1016', 'NCA': 'Q811', 'HAI': 'Q790', 'PNG': 'Q691',
             'TAN': 'Q924', 'TKM': 'Q874', 'GUY': 'Q734', 'BAR': 'Q244', 'FSM': 'Q702', 'KOS': 'Q1246', 'PLW': 'Q695', 'TOG': 'Q945', 'VIN': 'Q757', 'STP': 'Q45', 'ASA': 'Q30', 'BIZ': 'Q242', 'MON': 'Q235', 'INA': 'Q252',
             'MAK': 'Q83958', 'SWZ': 'Q1050', 'PAN': 'Q804', 'BRU': 'Q921', 'COD': 'Q974', 'BEN': 'Q962', 'OMA': 'Q842', 'CHA': 'Q657', 'TUV': 'Q672', 'TGA': 'Q678', 'AHO': 'Q29999','LES': 'Q1013', 'NRU': 'Q697',
             'GUI': 'Q1006', 'BHU': 'Q917', 'SOL': 'Q685', 'DMA': 'Q784', 'SOM': 'Q1045', 'DJI': 'Q977', 'MKD': 'Q221', 'CAY': 'Q145', 'UAR': 'Q79', 'CPV': 'Q1011', 'MHL': 'Q709', 'EOR': 'Q958', 'COM': 'Q970',
             'NIG': 'Q1032', 'COR': 'Q884','EPH': 'Q1747689', 'MAL': 'Q833', 'ROM': 'Q1747689', 'YMD': 'Q199841', 'WIF': 'Q754', 'YAR': 'Q267584', 'MTN': 'Q1025'}

def wd_sparql_query(spq):
    wikidatasite = pywikibot.Site('wikidata', 'wikidata')
    generator = pg.WikidataSPARQLPageGenerator(spq, site=wikidatasite)
    for wd in generator:
        if (wd.exists()):
            wd.get(get_redirect=True)
            yield wd


def parseWeightLength(string):
    try:
        string += ' x x x '
        weight = length = 0
        wstr = lstr = ''
        w_unit = l_unit = rw_unit = rlunit = ''
        i = 0
        while string[i] in numbers:
            lstr += string[i]
            i += 1
        while string[i] in [space]:
            i += 1
            # print('lstr',lstr,i)
        while string[i] not in [' ', '/', characters]:
            l_unit += string[i]
            i += 1
            # print('l_unit',l_unit,i)
        while string[i] in [space, ' ', '/', '-']:
            i += 1
            # print('x',i)
        while string[i] in numbers:
            wstr += string[i]
            i += 1
            # print('wstr',wstr)
        while string[i] in [space]:
            i += 1
            # print('z',i)
        while ((i < len(string)) & (string[i] not in [' ', '/', characters])):
            w_unit += string[i]
            if (i < len(string)):
                i += 1
            # print('wunit',i,w_unit,len(string))

        if l_unit == 'cm':
            rl_unit = cm
        elif l_unit == 'm':
            rl_unit = m
        else:
            return(0, '', 0, '')

        if w_unit == 'g':
            rw_unit = g
        elif w_unit == 'kg':
            rw_unit = kg
        else:
            return(0, '', 0, '')

        while string[i] in numbers:
            lstr = lstr+string[i]
        #print('[%s][%s]-[%s][%s]' % (wstr,w_unit,lstr,l_unit))
        if wstr == '':
            wstr = '0'
        if lstr == '':
            lstr = '0'
        return (int(wstr), rw_unit, int(lstr), rl_unit)
    except:
        return null, null, null, null


def makesrc():
    source_claim = pywikibot.Claim(repo, 'P248', is_reference=True)
    source_claim.setTarget(pywikibot.ItemPage(repo, 'Q95606922'))
    return(source_claim)


def newClaim(wd, P, value, unit, summary):
    # target=pywikibot.WbQuantity(value,pywikibot.ItemPage(repo,unit),0.1,site=site)
    target = pywikibot.WbQuantity(
        value, pywikibot.ItemPage(repo, unit), site=site)
    claim = pywikibot.Claim(repo, P)
    claim.setTarget(target)
    claim.addSources([makesrc()])
    wd.addClaim(claim, summary=summary)


def simpleway(wd, table):
    row = 0
    hdr = [td.get_text() for td in table.findAll('th')]
    for rowdata in table.findAll("tr"):
        cells = rowdata.findAll("td")
        for x in range(0, len(cells)):
            if (hdr[row] == 'Measurements'):
                weight, wunit, length, lunit = parseWeightLength(
                    cells[x].find(text=True))
                # print(weight,wunit,length,lunit)

                if (not Pweight in wd.claims) and (weight > 30):
                    newClaim(wd, Pweight, weight, wunit,
                             'add weight from Olympedia')
                if (not Plength in wd.claims) and (length > 90):
                    newClaim(wd, Plength, length, lunit,
                             'add length from Olympedia')
            elif (hdr[row] == 'Type'):
                pass
            elif (hdr[row] == 'Full name'):
                fullname = cells[x].find(text=True).replace('•', ' ')
                updateOneAlias(wd, lng, fullname)
            elif (hdr[row] == 'Used name'):
                usedname = cells[x].find(text=True).replace('•', ' ')
                updateOneAlias(wd, lng, usedname)
            elif (hdr[row] == 'Other names'):
                othername = cells[x].find(text=True).replace('•', ' ')
                updateOneAlias(wd, lng, othername)
            elif (hdr[row] == 'Original name'):
                originalname = cells[x].find(text=True).replace('•', ' ')
                updateOneAlias(wd, lng, originalname)
            elif (hdr[row] == 'Nick/petnames'):
                for nick in cells[x].find(text=True).split(','):
                    addNickName(wd, nick)
            elif (hdr[row] == 'NOC'):
                countryname = cells[x].find()
                if (not('P27' in wd.claims)):
                    addCountry(wd, countryname)
                else:
                    listCountry(wd, countryname)
            elif (hdr[row] == 'Affiliations'):
                pass
            elif (hdr[row] == 'Died'):
                if (not 'P570' in wd.claims):
                    DateClaim(wd, 'P570', cells[x].find(text=True))
            elif (hdr[row] == 'Born'):
                if (not 'P569' in wd.claims):
                    DateClaim(wd, 'P569', cells[x].find(text=True))
            elif (hdr[row] == 'Sex'):
                if (not('P21') in wd.claims):
                    sexe = cells[x].find(text=True)
                    if (sexe == 'Female'):
                        addFemale(wd)
                    elif (sexe == 'Male'):
                        addMale(wd)
            else:
                print(hdr[row], '––', cells[x].find(text=True))
            row += 1
def addMale(wd):
    print('addMale')
    target=pywikibot.ItemPage(repo,'Q6581097')
    claim=pywikibot.Claim(repo,'P21')
    claim.setTarget(target)
    claim.addSources([makesrc()])
    wd.addClaim(claim,summary='add sexe from Olympedia')

def addFemale(wd):
    print('addFemale')
    target=pywikibot.ItemPage(repo,'Q6581072')
    claim=pywikibot.Claim(repo,'P21')
    claim.setTarget(target)
    claim.addSources([makesrc()])
    wd.addClaim(claim,summary='add sexe from Olympedia')

def addCountry(wd, countryname):
    claim = pywikibot.Claim(repo, 'P27')
    cntr = countryname.get('src').replace(
        '/images/flags/', '').replace('.png', '')
    print(cntr)
    if (cntr in countries):
        target = pywikibot.ItemPage(repo, countries[cntr])
        claim.setTarget(target)
        claim.addSources([makesrc()])
        wd.addClaim(claim, summary='add nationality from Olympedia')
    else:
        print('missing %1 in countries' % cntr)


def updateOneAlias(wd, lng, alias):
    if lng in wd.labels:
        if wd.labels[lng] == alias:
            return  # little need to add label as alias
    wd.get(get_redirect=True)
    newalias = []
    if (lng in wd.aliases):
        newalias = wd.aliases[lng]
        if not(alias in wd.aliases[lng]):
            newalias.append(alias)
        else:
            return
    else:
        newalias.append(alias)
    wd.editEntity({'aliases': {lng: newalias}},
                  summary=f'---add from Olympedia alias for {lng}')
    print('Add alias %s' % alias)


def addNickName(wd, nickname):

    if ('P1449' in wd.claims):
        return
        for claim in wd.claims['P1449']:
            if claim.getTarget().text == nickname:
                return

    claim = pywikibot.Claim(repo, 'P1449')
    target = pywikibot.WbMonolingualText(text=nickname, language='en')
    claim.setTarget(target)
    claim.addSources([makesrc()])
    wd.addClaim(claim, summary='Nickname from Olympedia')


def Olympedia(wd):
    try:
        f = urlopen('https://www.olympedia.org/athletes/%s' %
                    wd.claims['P8286'][0].getTarget())
    except:
        return
    htmltext = f.read().decode('utf-8')
    soup = BeautifulSoup(htmltext)
    table = soup.find("table", attrs={"class": "biodata"})
    simpleway(wd, table)


def allOlympedians():
    #for wd in wd_sparql_query('select ?item where {?item wdt:P8286 ?o; wdt:P27 ?l. ?item wdt:P106 wd:Q11513337; wdt:P27 wd:Q29999} '):
    for wd in wd_sparql_query('select ?item where {?item wdt:P8286 ?o; wdt:P27 ?l}'):
        print('wd: ', wd.title())
        Olympedia(wd)


def listCountry(wd, countryname):
    try:
      cntr = countryname.get('src').replace('/images/flags/', '').replace('.png', '')
    except:
      cntr = None    
    if (cntr!=None) and (not(cntr in countries)):
        if 'P27' in wd.claims:
            if len(wd.claims['P27']) == 1:
                country = wd.claims['P27'][0].getTarget().title()
                countries.update({cntr: country})
                print(countries)  # copy-paste into source code


def DateClaim(wd, P, DateStr):
    try:
        print('Begin')
        date = parser.parse(DateStr.replace(' in ', ''))
        print('OK-!')
    except:
        print('unknown date format, skipped: ', DateStr)
        return  # no valid date extracted
    target = pywikibot.WbTime(date.year, date.month, date.day)
    claim = pywikibot.Claim(repo, P)
    claim.setTarget(target)
    claim.addSources([makesrc()])
    wd.addClaim(claim, summary='date from Olympedia')


#DateClaim('','P570','15 August 1980')
#item=pywikibot.ItemPage(repo,'Q31295074')
#Olympedia(item)
allOlympedians()