# Wiki page: User:Edoderoobot/WDcyrillic2latin
import pywikibot
from pywikibot import pagegenerators
import pywikibot.data.wikidataquery as wdquery
import codecs #used in logfiles, unicoded strings
import sys
import datetime
from datetime import datetime, date, time
# Debug switch: when True the script runs test() on a single item instead of main().
debugedo = False

# Wikidata property "instance of" and the item meaning "human".
CLAIM_is_a = 'P31'
CLAIMED_person = 'Q5'

# Default WikidataQuery: a Russian person that has a Russian article.
default_query = 'claim[31:5] and claim[27:159] and link[ruwiki]'
default_language = 'nl'
# Per-character transliteration table for Russian Cyrillic to Latin script.
# ASCII letters, digits and some punctuation map to themselves.
# Several digraphs ('oe', 'sj', 'tsj', 'zj') appear to follow Dutch spelling
# conventions -- TODO confirm.
# Some keys occur more than once (e.g. 'х' -> 'h' and later 'х' -> 'ch',
# 'л' twice); in a dict literal the LAST occurrence wins, so 'х' ends up as 'ch'.
# NOTE(review): several keys are the empty string (mapped to 'c', 'q', 'u', ...);
# they collapse into the single final entry '' -> ''. They look like placeholders
# or characters lost in transit -- verify against the original table.
# NOTE(review): this table is not referenced anywhere else in the visible source.
ru_cyrilic2latin = { 'а':'a','б':'b','':'c','д':'d','е':'e','ф':'f','г':'g','х':'h','и':'i','ю':'ju','к':'k','л':'l','м':'m','н':'n','о':'o','п':'p','':'q','р':'r','с':'s','т':'t','':'u','в':'v','':'w','':'x','й':'y','з':'z',
'А':'A','Б':'B','':'C','Д':'D','Э':'E','Ф':'F','Г':'G','Х':'H','И':'I','Ю':'Ju','К':'K','Л':'L','М':'M','Н':'N','О':'O','П':'P','':'Q','Р':'R','С':'S','Т':'T','':'U','В':'V','':'W','':'X','Й':'Y','З':'Z',
'ж':'zj','э':'e','л':'l',' ':' ','?':'?','ш':'sj','ч':'tsj',',':',','у':'oe','ы':'i','Ы':'I','Щ':'Sjtsj','щ':'sjtsj',
'х':'ch','У':'Oe','ц':'ts','-':'-','Ж':'Zj','Я':'Ja','я':'ja','Ш':'Sj','Ч':'Tsj','Ё':'Jo','ё':'jo','+':'+','=':'=','/':'/','Ц':'Ts','Е':'E','ь':'','.':'.',
'a':'a','b':'b','c':'c','d':'d','e':'e','f':'f','g':'g','h':'h','i':'i','j':'j','k':'k','l':'l','m':'m','n':'n','o':'o','p':'p','q':'q','r':'r','s':'s','t':'t','u':'u','v':'v','w':'w','x':'x','y':'y','z':'z',
'A':'A','B':'B','C':'C','D':'D','E':'E','F':'F','G':'G','H':'H','I':'I','J':'J','K':'K','L':'L','M':'M','N':'N','O':'O','P':'P','Q':'Q','R':'R','S':'S','T':'T','U':'U','V':'V','W':'W','X':'X','Y':'Y','Z':'Z',
'1':'1','2':'2','3':'3','4':'4','5':'5','6':'6','7':'7','8':'8','9':'9','0':'0','(':'(',')':')','$':'$','\'':'\'',
'':''}
# Per-character transliteration table for Serbian Cyrillic to Latin script.
# Most letters map to the Latin letter with diacritics (ć, č, š, dž), but
# 'ђ' -> 'dj' and 'ж' -> 'zj' are written as plain digraphs.
# ASCII letters, digits, the space and some punctuation map to themselves.
# The final '' -> '' entry maps the empty string to itself.
sr_cyrilic2latin = {'а':'a','б':'b','в':'v','г':'g','д':'d','ђ':'dj','е':'e','ж':'zj','з':'z','и':'i','ј':'j','к':'k','л':'l','љ':'lj','м':'m','н':'n','њ':'nj','о':'o','п':'p','р':'r','с':'s','т':'t','ћ':'ć','у':'u','ф':'f','х':'h','ц':'c','ч':'č','џ':'dž','ш':'š',
'А':'A','Б':'B','В':'V','Г':'G','Д':'D','Ђ':'Dj','Е':'E','Ж':'Zj','З':'Z','И':'I','Ј':'J','К':'K','Л':'L','Љ':'Lj','М':'M','Н':'N','Њ':'Nj','О':'O','П':'P','Р':'R','С':'S','Т':'T','Ћ':'Ć','У':'U','Ф':'F','Х':'H','Ц':'C','Ч':'Č','Џ':'Dž','Ш':'Š',
'a':'a','b':'b','c':'c','d':'d','e':'e','f':'f','g':'g','h':'h','i':'i','j':'j','k':'k','l':'l','m':'m','n':'n','o':'o','p':'p','q':'q','r':'r','s':'s','t':'t','u':'u','v':'v','w':'w','x':'x','y':'y','z':'z',
'A':'A','B':'B','C':'C','D':'D','E':'E','F':'F','G':'G','H':'H','I':'I','J':'J','K':'K','L':'L','M':'M','N':'N','O':'O','P':'P','Q':'Q','R':'R','S':'S','T':'T','U':'U','V':'V','W':'W','X':'X','Y':'Y','Z':'Z',
'1':'1','2':'2','3':'3','4':'4','5':'5','6':'6','7':'7','8':'8','9':'9','0':'0','(':'(',')':')','$':'$','\'':'\'','-':'-','.':'.','—':'—',' ':' ',
'':''}
def log_premature(itemno):
    """Append one item identifier to ``cyrillic-description.prelog.csv``.

    Intended to be called before an item is processed so that, after a
    crash, the last line of the file names the item being handled.

    Arguments:
        * itemno - the identifier to record; written with ``%s`` formatting.
    """
    # The ``with`` block closes the file; no explicit close is required.
    with codecs.open("cyrillic-description.prelog.csv", "a", encoding="utf-8") as logfile:
        # One-element tuple so that a tuple-valued itemno is still formatted.
        logfile.write('%s\n' % (itemno,))
def logme(verbose, formatstring, *parameters):
    """Append one formatted line to ``cyrillic-description.log.csv``.

    Arguments:
        * verbose - when true, the message is also printed to stdout.
        * formatstring - a ``%``-style format string.
        * parameters - the values substituted into ``formatstring``.

    If the format string and the parameters do not match, the raw format
    string and the parameters are logged and printed instead of raising.
    A failure while writing the line forces the message to be printed.
    """
    try:
        message = formatstring % parameters
    except (TypeError, ValueError):
        # Mismatch between placeholders and values: keep the information
        # rather than lose the log entry, and make the problem visible.
        message = u'%s %r' % (formatstring, parameters)
        verbose = True
    with codecs.open("cyrillic-description.log.csv", "a", encoding="utf-8") as logfile:
        try:
            logfile.write(u'%s%s' % (message, '\n'))
        except (IOError, UnicodeError):
            verbose = True  # could not log it, so at least show it
    if verbose:
        print(message)
def logsame(one, two, three):
    """Append the three values as a ``|``-separated line to ``same.cyrilic.csv``.

    A failure while writing is reported on stdout and otherwise ignored.
    """
    with codecs.open("same.cyrilic.csv", "a", encoding="utf-8") as logfile:
        try:
            logfile.write("%s|%s|%s\n" % (one, two, three))
        except Exception as error:
            # Logging is best effort; report and carry on. KeyboardInterrupt
            # and SystemExit are deliberately not intercepted.
            print("1) Error writing to logfile on: [%s] [%s]" % (type(error), error))
def logdiff(Qqqq, one, two, three):
    """Append the four values as a ``|``-separated line to ``diff.cyrilic.csv``.

    A failure while writing is reported on stdout and otherwise ignored.
    """
    with codecs.open("diff.cyrilic.csv", "a", encoding="utf-8") as logfile:
        try:
            logfile.write("%s|%s|%s|%s\n" % (Qqqq, one, two, three))
        except Exception as error:
            # Logging is best effort; report and carry on. KeyboardInterrupt
            # and SystemExit are deliberately not intercepted.
            print("1) Error writing to logfile on: [%s] [%s]" % (type(error), error))
def lognew(Qqqq, one, two, three):
    """Append the four values as a ``|``-separated line to ``new.cyrilic.csv``.

    A failure while writing is reported on stdout and otherwise ignored.
    """
    with codecs.open("new.cyrilic.csv", "a", encoding="utf-8") as logfile:
        try:
            logfile.write("%s|%s|%s|%s\n" % (Qqqq, one, two, three))
        except Exception as error:
            # Logging is best effort; report and carry on. KeyboardInterrupt
            # and SystemExit are deliberately not intercepted.
            print("1) Error writing to logfile on: [%s] [%s]" % (type(error), error))
class WDBot():
    """
    A bot that adds transliterated (Cyrillic to Latin) labels on Wikidata
    """

    # Upper bound on the number of counted items before run() stops.
    MAX_ITEMS = 9999999

    def __init__(self, generator):
        """
        Arguments:
            * generator - A generator that yields itempage objects.
        """
        self.generator = generator
        self.repo = pywikibot.Site().data_repository()

    def run(self, srclng, destlng):
        """
        Starts the robot.

        Arguments:
            * srclng - language code of the existing label to transliterate.
            * destlng - language code of the label to add.
        """
        # pywikibot.Site is the same factory the rest of this file uses.
        site = pywikibot.Site(srclng)
        repo = site.data_repository()
        items_found = 0
        for WDIquery in self.generator:
            if items_found > self.MAX_ITEMS:
                break
            if WDIquery.exists():
                # log_premature(WDIquery.title()) can be enabled here to record
                # which item is being processed in case of an error.
                WDIquery.get(get_redirect=True)
                items_found += action_one_item(repo, WDIquery, srclng, destlng)
def WikidataQueryItemPageGenerator(query, site=None):
    """Yield an ItemPage for every item returned by a WikidataQuery.

    @param query: the WikidataQuery query string.
    @param site: site whose data repository the pages belong to;
        ``pywikibot.Site()`` is used when omitted.

    Side effect: stores the reported item count in the global ``items2do``.
    """
    global items2do
    if site is None:
        site = pywikibot.Site()
    repository = site.data_repository()
    queryset = wdquery.QuerySet(query)
    # cacheMaxAge=0 is passed through to WikidataQuery unchanged.
    client = wdquery.WikidataQuery(cacheMaxAge=0)
    response = client.query(queryset)
    items2do = response[u'status'][u'items']
    pywikibot.output(u'retrieved %d items' % response[u'status'][u'items'])
    for number in response[u'items']:
        title = u'Q' + str(number)
        yield pywikibot.ItemPage(repository, title)
def convert_cyrilic2latin(cyrilic2latin, fromchar):
    """Return the table entry for ``fromchar``, or ``fromchar`` itself if absent.

    Arguments:
        * cyrilic2latin - mapping from source characters to replacements.
        * fromchar - the character to look up.
    """
    return cyrilic2latin[fromchar] if fromchar in cyrilic2latin else fromchar
def normalize_name_of_person(name):
    """Turn ``"Last, First"`` into ``"First Last"``.

    The name is split at its first comma. It is only rearranged when there
    is text before the comma and the comma is directly followed by a space;
    in every other case (no comma, leading comma, ``"A,B"``) the name is
    returned unchanged.
    """
    last, comma, first = name.partition(',')
    # ``comma`` is empty when no comma exists; ``last`` is empty when the
    # name starts with a comma. Neither form is a "Last, First" name.
    if not comma or not last:
        return name
    if not first.startswith(' '):
        return name
    return first.strip() + ' ' + last
def action_one_item(repo, wditem, srclng, destlng):
    """Add a transliterated ``destlng`` label to one item that has none.

    Arguments:
        * repo - the data repository (not used by this function).
        * wditem - the loaded item page to process.
        * srclng - language code of the label that is transliterated.
        * destlng - language code of the label that is written.

    Returns 1 when an edit was submitted without raising, otherwise 0.
    Nothing is edited when the item has no ``srclng`` label, already has a
    ``destlng`` label, or no longer has a ``<srclng>wiki`` sitelink.
    Errors raised by the edit are logged through logme() and swallowed.
    """
    placefound = u'WDcyrillic2latin'
    if srclng not in wditem.labels:
        return 0

    instance_of = ''
    if CLAIM_is_a in wditem.claims:
        target = wditem.claims.get(CLAIM_is_a)[0].getTarget()
        # A claim without a concrete value has no target to take a title from.
        if target is not None:
            instance_of = target.title()

    if instance_of == CLAIMED_person:
        src_desc = normalize_name_of_person(wditem.labels[srclng])
    else:
        src_desc = wditem.labels[srclng]
    # NOTE(review): the Serbian table is used whatever srclng is -- confirm
    # this is intended before running with another source language.
    conv_desc = str_cyrilic2latin(sr_cyrilic2latin, src_desc)
    if debugedo:
        print ("[%s]-[%s] == \n[%s]\n[%s]" % (src_desc,conv_desc,src_desc.encode('utf-8'),conv_desc.encode('utf-8')))

    if destlng in wditem.labels:
        # There is already a label in the target language; never overwrite it.
        return 0

    edited = False
    try:
        if srclng + 'wiki' in wditem.sitelinks:  # check if link still exist
            # Only the new label is sent; the in-memory item is not modified.
            data = {'labels': {destlng: conv_desc}}
            wditem.editEntity(data,summary=u'cyrillic2latin-label, python code on https://goo .gl/L3vh0e , logfile on https://goo .gl/BezTim')
            edited = True
    except ValueError:
        logme(False, "ValueError occured on %s",wditem.title())
    except Exception:
        # Keep the bot running on any other failure, but let
        # KeyboardInterrupt / SystemExit stop it.
        logme(False, "Undefined error occured on %s: srclng:%s %s",wditem.title(),srclng,conv_desc)

    if not edited:
        return 0
    # Record only edits that were actually submitted.
    logme(False, u'%s|%s|%s|%s|%s|%s',datetime.now().strftime("%Y-%b-%d/%H:%M:%S"),wditem.title(),srclng,src_desc,conv_desc,placefound)
    return 1
def str_cyrilic2latin(cyrilic2latin, instring):
    """Return ``instring`` with every character replaced via ``cyrilic2latin``.

    Arguments:
        * cyrilic2latin - mapping from source characters to replacements.
        * instring - the text to transliterate.

    The per-character results come from char_by_char().
    """
    # join() builds the result in one pass instead of re-copying the
    # growing string for every character.
    return ''.join(char_by_char(cyrilic2latin, instring))
def char_by_char(cyrilic2latin, instring):
    """Yield the converted form of each character of ``instring``, in order.

    Each character is passed to convert_cyrilic2latin() together with the
    given table.
    """
    for character in instring:
        yield convert_cyrilic2latin(cyrilic2latin, character)
def test():
    """Process one hard-coded item (manual debugging aid).

    Loads the item and hands it to action_one_item(), which may submit a
    live edit to the repository.
    """
    site = pywikibot.Site()
    repo = site.data_repository()
    item2load = u'Q346006' #Ada
    print("start")
    x = pywikibot.ItemPage(repo, item2load)
    x.get(get_redirect=True)
    # action_one_item needs both a source and a destination language.
    # NOTE(review): 'sr' mirrors the source language used in main() -- confirm.
    action_one_item(repo, x, 'sr', default_language)
def main():
    """Run the bot over the hard-coded query, from 'sr' labels to 'nl' labels."""
    destlng = 'nl'
    srclng = 'sr'
    print ("main")
    # The query is hard-coded for now; later it should be managed with
    # parameters (default_query holds the Russian-person query).
    # Other queries that have been used:
    #   'claim[31:515] and claim[17:403]'   town in Serbia
    #   'claim[31:5] and claim[27:159]'     people from Russia
    #   'link[srwiki] and claim[17:403]'    link on sr-wiki and country=Serbia (6510 items)
    query = 'link[srwiki] and claim[495:403]' # country of origin=Serbia
    item_pages = WikidataQueryItemPageGenerator(query)
    pigenerator = pagegenerators.PreloadingItemGenerator(
        pagegenerators.WikidataItemGenerator(item_pages))
    wikidataBot = WDBot(pigenerator)
    wikidataBot.run(srclng, destlng)
    print("\n")
# Script entry point: the debug switch selects the single-item test run,
# otherwise the full bot run is started.
if __name__ == "__main__":
    if not debugedo:
        print("Klaar voor de start")
        main()
    else:
        print("debug is on")
        test()