# User:Edoderoobot/WDcyrillic2latin
#
# From Wikidata
import pywikibot
from pywikibot import pagegenerators
import pywikibot.data.wikidataquery as wdquery
import codecs #used in logfiles, unicoded strings
import sys
import datetime
from datetime import datetime, date, time


# Debug switch: the __main__ guard runs test() instead of main() when this is
# True, and action_one_item() prints the converted label. Flip to True to debug.
debugedo = False

# Wikidata identifiers consulted when inspecting an item.
CLAIM_is_a = 'P31'
CLAIMED_person = 'Q5'

# a Russian person, has Russian article
default_query = 'claim[31:5] and claim[27:159] and link[ruwiki]'
default_language = 'nl'

# Character table mapping Russian Cyrillic letters to Latin spellings
# (e.g. 'у' -> 'oe', 'ш' -> 'sj'); ASCII letters, digits and some punctuation
# map to themselves.
#
# Several keys occur more than once in this literal; in a dict literal the
# LAST occurrence wins, so:
#   * 'х' ends up as 'ch' (the earlier 'h' entry is overridden),
#   * 'л' is listed twice with the same value,
#   * the entries keyed by '' (which look like placeholders for Latin letters
#     without a Cyrillic counterpart) all collapse into the final '':''.
#
# NOTE(review): no function in the visible source references this table;
# action_one_item() always passes sr_cyrilic2latin -- confirm whether this
# table is still needed.
ru_cyrilic2latin = { 'а':'a','б':'b','':'c','д':'d','е':'e','ф':'f','г':'g','х':'h','и':'i','ю':'ju','к':'k','л':'l','м':'m','н':'n','о':'o','п':'p','':'q','р':'r','с':'s','т':'t','':'u','в':'v','':'w','':'x','й':'y','з':'z',
                     'А':'A','Б':'B','':'C','Д':'D','Э':'E','Ф':'F','Г':'G','Х':'H','И':'I','Ю':'Ju','К':'K','Л':'L','М':'M','Н':'N','О':'O','П':'P','':'Q','Р':'R','С':'S','Т':'T','':'U','В':'V','':'W','':'X','Й':'Y','З':'Z',
                     'ж':'zj','э':'e','л':'l',' ':' ','?':'?','ш':'sj','ч':'tsj',',':',','у':'oe','ы':'i','Ы':'I','Щ':'Sjtsj','щ':'sjtsj',
                     'х':'ch','У':'Oe','ц':'ts','-':'-','Ж':'Zj','Я':'Ja','я':'ja','Ш':'Sj','Ч':'Tsj','Ё':'Jo','ё':'jo','+':'+','=':'=','/':'/','Ц':'Ts','Е':'E','ь':'','.':'.',
                     'a':'a','b':'b','c':'c','d':'d','e':'e','f':'f','g':'g','h':'h','i':'i','j':'j','k':'k','l':'l','m':'m','n':'n','o':'o','p':'p','q':'q','r':'r','s':'s','t':'t','u':'u','v':'v','w':'w','x':'x','y':'y','z':'z',
                     'A':'A','B':'B','C':'C','D':'D','E':'E','F':'F','G':'G','H':'H','I':'I','J':'J','K':'K','L':'L','M':'M','N':'N','O':'O','P':'P','Q':'Q','R':'R','S':'S','T':'T','U':'U','V':'V','W':'W','X':'X','Y':'Y','Z':'Z',
                     '1':'1','2':'2','3':'3','4':'4','5':'5','6':'6','7':'7','8':'8','9':'9','0':'0','(':'(',')':')','$':'$','\'':'\'',
                 '':''}
                
# Character table mapping Serbian Cyrillic letters to Latin spellings; ASCII
# letters, digits and some punctuation map to themselves. This is the table
# action_one_item() passes to str_cyrilic2latin(). Characters missing from the
# table are returned unchanged by convert_cyrilic2latin().
#
# NOTE(review): 'ж' -> 'zj' and 'ђ' -> 'dj' are digraphs, while 'ч' -> 'č',
# 'ш' -> 'š', 'ћ' -> 'ć' and 'џ' -> 'dž' use diacritics -- confirm the mix is
# intended.
sr_cyrilic2latin = {'а':'a','б':'b','в':'v','г':'g','д':'d','ђ':'dj','е':'e','ж':'zj','з':'z','и':'i','ј':'j','к':'k','л':'l','љ':'lj','м':'m','н':'n','њ':'nj','о':'o','п':'p','р':'r','с':'s','т':'t','ћ':'ć','у':'u','ф':'f','х':'h','ц':'c','ч':'č','џ':'dž','ш':'š',
                    'А':'A','Б':'B','В':'V','Г':'G','Д':'D','Ђ':'Dj','Е':'E','Ж':'Zj','З':'Z','И':'I','Ј':'J','К':'K','Л':'L','Љ':'Lj','М':'M','Н':'N','Њ':'Nj','О':'O','П':'P','Р':'R','С':'S','Т':'T','Ћ':'Ć','У':'U','Ф':'F','Х':'H','Ц':'C','Ч':'Č','Џ':'Dž','Ш':'Š',
                     'a':'a','b':'b','c':'c','d':'d','e':'e','f':'f','g':'g','h':'h','i':'i','j':'j','k':'k','l':'l','m':'m','n':'n','o':'o','p':'p','q':'q','r':'r','s':'s','t':'t','u':'u','v':'v','w':'w','x':'x','y':'y','z':'z',
                     'A':'A','B':'B','C':'C','D':'D','E':'E','F':'F','G':'G','H':'H','I':'I','J':'J','K':'K','L':'L','M':'M','N':'N','O':'O','P':'P','Q':'Q','R':'R','S':'S','T':'T','U':'U','V':'V','W':'W','X':'X','Y':'Y','Z':'Z',
                     '1':'1','2':'2','3':'3','4':'4','5':'5','6':'6','7':'7','8':'8','9':'9','0':'0','(':'(',')':')','$':'$','\'':'\'','-':'-','.':'.','—':'—',' ':' ',
                 '':''}

def log_premature(itemno):
    """Append *itemno* on its own line to ``cyrillic-description.prelog.csv``.

    Intended to record which item is about to be processed, so that the
    offending item is known if the run aborts.

    Args:
        itemno: Identifier to record; formatted with ``%s``.
    """
    # The ``with`` block closes the file; the former trailing
    # ``logfile.close`` (without parentheses) never did anything.
    with codecs.open("cyrillic-description.prelog.csv", "a", encoding="utf-8") as logfile:
        logfile.write('%s\n' % (itemno,))
 
 
def logme(verbose, formatstring, *parameters):
    """Append a %-formatted line to ``cyrillic-description.log.csv``.

    Args:
        verbose: When true, the message is also printed to stdout. Forced to
            true when formatting or writing fails, so the problem is visible.
        formatstring: ``%``-style format string.
        *parameters: Values substituted into *formatstring*.

    A logging failure never propagates an ordinary exception to the caller:
    a bad format falls back to a raw dump of the arguments, and a failed
    write is reported on stdout.
    """
    # Format exactly once. Previously a format error was caught around the
    # write and then re-triggered by the verbose print, crashing the logger.
    try:
        message = formatstring % parameters
    except (TypeError, ValueError) as err:
        message = u'%s %r (log format error: %s)' % (formatstring, parameters, err)
        verbose = True

    with codecs.open("cyrillic-description.log.csv", "a", encoding="utf-8") as logfile:
        try:
            logfile.write(message + u'\n')
        except Exception as err:  # best-effort logging; Ctrl-C still propagates
            print("Error writing to logfile on: [%s] [%s]" % (type(err), err))
            verbose = True

    if verbose:
        print(message)


def logsame(one,two,three):
    """Append ``one|two|three`` as one line to ``same.cyrilic.csv``.

    Args:
        one, two, three: Column values; each is formatted with ``%s``.

    A failure while formatting or writing is reported on stdout instead of
    being raised, so logging stays best-effort.
    """
    with codecs.open("same.cyrilic.csv", "a", encoding="utf-8") as logfile:
        try:
            logfile.write("%s|%s|%s\n" % (one, two, three))
        except Exception as err:  # not a bare except: Ctrl-C must still stop the bot
            print("1) Error writing to logfile on: [%s] [%s]" % (type(err), err))
     

def logdiff(Qqqq,one,two,three):
    """Append ``Qqqq|one|two|three`` as one line to ``diff.cyrilic.csv``.

    Args:
        Qqqq: First column (the commented-out caller passes the item title).
        one, two, three: Remaining column values; each formatted with ``%s``.

    A failure while formatting or writing is reported on stdout instead of
    being raised, so logging stays best-effort.
    """
    with codecs.open("diff.cyrilic.csv", "a", encoding="utf-8") as logfile:
        try:
            logfile.write("%s|%s|%s|%s\n" % (Qqqq, one, two, three))
        except Exception as err:  # not a bare except: Ctrl-C must still stop the bot
            print("1) Error writing to logfile on: [%s] [%s]" % (type(err), err))

def lognew(Qqqq,one,two,three):
    """Append ``Qqqq|one|two|three`` as one line to ``new.cyrilic.csv``.

    Args:
        Qqqq: First column (the commented-out caller passes the item title).
        one, two, three: Remaining column values; each formatted with ``%s``.

    A failure while formatting or writing is reported on stdout instead of
    being raised, so logging stays best-effort.
    """
    with codecs.open("new.cyrilic.csv", "a", encoding="utf-8") as logfile:
        try:
            logfile.write("%s|%s|%s|%s\n" % (Qqqq, one, two, three))
        except Exception as err:  # not a bare except: Ctrl-C must still stop the bot
            print("1) Error writing to logfile on: [%s] [%s]" % (type(err), err))

   
class WDBot():
    """
    A bot that walks Wikidata items and hands each one to action_one_item(),
    which adds a transliterated label in the destination language.
    """
    def __init__(self, generator):
        """
        Arguments:
            * generator    - A generator that yields itempage objects.
        """
        self.generator = generator
        self.repo = pywikibot.Site().data_repository()

    def run(self,srclng,destlng):
        """
        Starts the robot.

        Arguments:
            * srclng   - language code whose label is the transliteration source
            * destlng  - language code of the label to create
        """
        # pywikibot.Site replaces the deprecated alias pywikibot.getSite.
        site = pywikibot.Site(srclng)
        repo = site.data_repository()

        items_found=0
        for WDIquery in self.generator:
            # Safety cap on the number of edit attempts in a single run.
            if items_found> 9999999 :
               break
            if WDIquery.exists() :
                #log_premature(WDIquery.title())   #log which item we process ... in case of an error, I know which item it is
                WDIquery.get(get_redirect=True)
                # action_one_item returns 1 when it attempted an edit, else 0.
                items_found += action_one_item(repo,WDIquery,srclng,destlng)
       

def WikidataQueryItemPageGenerator(query, site=None):
    """Yield an ItemPage for every item id returned by a WikidataQuery.

    The number of items reported by the query is also stored in the
    module-level global ``items2do``.

    @param query: the WikidataQuery query string.
    @param site: site whose data repository owns the items; defaults to
        ``pywikibot.Site()``.
    """
    global items2do

    chosen_site = pywikibot.Site() if site is None else site
    repo = chosen_site.data_repository()

    queryset = wdquery.QuerySet(query)
    # cacheMaxAge=0 is passed through unchanged from the original code.
    response = wdquery.WikidataQuery(cacheMaxAge=0).query(queryset)

    items2do = response[u'status'][u'items']
    pywikibot.output(u'retrieved %d items' % items2do)

    for numeric_id in response[u'items']:
        yield pywikibot.ItemPage(repo, u'Q%s' % (numeric_id,))

def convert_cyrilic2latin(cyrilic2latin, fromchar):
    """Return the table entry for *fromchar*, or *fromchar* itself if absent.

    Args:
        cyrilic2latin: Mapping from source characters to replacement strings.
        fromchar: The character to convert.
    """
    return cyrilic2latin.get(fromchar, fromchar)
       
def normalize_name_of_person(name):
    """Turn ``"Last, First"`` into ``"First Last"``.

    Args:
        name: A person's label.

    Returns:
        The reordered name when *name* contains a comma (not at position 0)
        that is immediately followed by a space; otherwise *name* unchanged.
        Only the first comma is considered.
    """
    posfound = name.find(',')
    # find() returns -1 when there is no comma. The old ``== 0`` test let -1
    # through, so a comma-less name starting with a space was mangled.
    if posfound <= 0:
        return name
    if name[posfound + 1:posfound + 2] != ' ':
        return name
    first = name[posfound + 1:].strip()
    last = name[:posfound]
    return first + ' ' + last
       
def action_one_item(repo, wditem, srclng, destlng):
  """Add a *destlng* label to *wditem*, transliterated from its *srclng* label.

  Args:
      repo: Data repository; not used by this function.
      wditem: The item to process; its labels, claims and sitelinks are read.
      srclng: Language code of the label used as transliteration source.
      destlng: Language code of the label to create.

  Returns:
      0 when the item has no *srclng* label or already has a *destlng* label
      (nothing is edited); 1 when an edit was attempted, whether or not it
      succeeded. Edit failures are logged via logme(), not raised.
  """
  placefound = u'WDcyrillic2latin'
  if srclng not in wditem.labels:
    return 0

  # Title of the first "instance of" target; stays '' when there is no such
  # claim or the claim has no target (getTarget() returned None).
  instance_of = ''
  if CLAIM_is_a in wditem.claims:
    target = wditem.claims.get(CLAIM_is_a)[0].getTarget()
    if target is not None:
      instance_of = target.title()

  if instance_of == CLAIMED_person:
    # Person labels may be stored as "Last, First"; reorder them first.
    src_desc = normalize_name_of_person(wditem.labels[srclng])
  else:
    src_desc = wditem.labels[srclng]

  # NOTE(review): the Serbian table is used whatever srclng is -- confirm
  # before running with another source language.
  conv_desc = str_cyrilic2latin(sr_cyrilic2latin, src_desc)

  if debugedo:
    print ("[%s]-[%s] == \n[%s]\n[%s]" % (src_desc,conv_desc,src_desc.encode('utf-8'),conv_desc.encode('utf-8')))

  if destlng in wditem.labels:
    # there is already a label in the target language ... we do not update
    return 0

  # the target language is blank ...
  try:
    if srclng+'wiki' in wditem.sitelinks: #check if link still exist
      # NOTE(review): only the in-memory copy changes here; the data passed
      # to editEntity below carries conv_desc -- confirm this is still wanted.
      wditem.labels[destlng] = wditem.sitelinks[srclng+'wiki']
    data = {'labels': {destlng: conv_desc}}
    wditem.editEntity(data,summary=u'cyrillic2latin-label, python code on https://goo .gl/L3vh0e , logfile on https://goo .gl/BezTim')
  except ValueError:
    logme(False, "ValueError occured on %s",wditem.title())
  except Exception:
    # Keep the run going on any per-item failure, but no longer swallow
    # KeyboardInterrupt/SystemExit as the bare ``except`` did.
    logme(False, "Undefined error occured on %s: srclng:%s %s",wditem.title(),srclng,conv_desc)
  else:
    logme(False, u'%s|%s|%s|%s|%s|%s',datetime.now().strftime("%Y-%b-%d/%H:%M:%S"),wditem.title(),srclng,src_desc,conv_desc,placefound)

  return 1

def str_cyrilic2latin(cyrilic2latin, instring):
    """Return *instring* with every character converted through the table.

    Args:
        cyrilic2latin: Mapping from source characters to replacement strings.
        instring: The text to convert.
    """
    converted_pieces = char_by_char(cyrilic2latin, instring)
    return ''.join(converted_pieces)
   
   
def char_by_char(cyrilic2latin, instring) :
    """Yield the converted form of each character of *instring*, in order."""
    for character in instring:
        yield convert_cyrilic2latin(cyrilic2latin, character)

 
       
def test():
    """Run action_one_item() once against a single hard-coded item.

    Invoked from the ``__main__`` guard when ``debugedo`` is true.
    NOTE: this goes through the same code path as a normal run, so it can
    perform a real edit on the item.
    """
    site = pywikibot.Site()
    repo = site.data_repository()
    item2load = u'Q346006' #Ada

    print("start")
    x = pywikibot.ItemPage(repo, item2load)
    x.get(get_redirect=True)
    # action_one_item needs both a source and a destination language; the old
    # three-argument call raised TypeError. 'sr' matches main() and the
    # Serbian table action_one_item uses.
    action_one_item(repo, x, 'sr', default_language)

 
def main():
    """Query Wikidata and run the bot from Serbian ('sr') to Dutch ('nl')."""
    print ("main")
    srclng, destlng = 'sr', 'nl'

    query = default_query #later, I want to manage this with params
    #query = 'claim[31:515] and claim[17:403]' #town in Serbia
    #query = 'claim[31:5] and claim[27:159]' #people from Russia
    #query = 'link[srwiki] and claim[17:403]' #link on sr-wiki and country=Srbia  #6510 items
    query = 'link[srwiki] and claim[495:403]' #country of origin=Srbia  #xxxx items

    # Chain: raw query results -> item generator -> preloading generator.
    queried_items = WikidataQueryItemPageGenerator(query)
    item_pages = pagegenerators.WikidataItemGenerator(queried_items)
    pigenerator = pagegenerators.PreloadingItemGenerator(item_pages)

    WDBot(pigenerator).run(srclng, destlng)
    print("\n")

 
# Script entry point: a normal run unless the debug switch is set.
if __name__ == "__main__":
    if not debugedo:
        print("Klaar voor de start")
        main()
    else:
        print("debug is on")
        test()