Wikidata:Database reports/top missing properties by number of sitelinks/Configuration

From Wikidata
Jump to navigation Jump to search
#!/usr/bin/python
# -*- coding: UTF-8 -*-
#based on Pasleim's other reports

import MySQLdb
import pywikibot
import time
import urllib2
import json
import re


site = pywikibot.Site('wikidata','wikidata')
repo = site.data_repository()

properties = ('P364','P1476','P577','P21', 'P26', 'P1412', 'P22', 'P551', 'P345','P161','P57','P162','P915','P58','P119','P1196','P570','P195','P170','P186','P180','P31','P105','P171')  #properties for which the reports are to be created (sql below)
reportscurrentlydeactivated = ('P106', 'P735', 'P569', 'P214') 
validateQ5 = ('P569', 'P21', 'P735', 'P106', 'P26', 'P214', 'P1412', 'P22','P119','P551','P1196','P570')  #properties for which the sql result needs to be checked for existence of P31=Q5
validateQ11424 = ('P364','P1476','P577','P345','P161','P57','P162','P915','P58','P364')  # 'P364','P1476','P577', checked for films only
validateQ16521 = ('P105','P171')
validateartwork = ('P195','P170','P186','P180')  # start with CLAIM[31:3305213]
validateP31 = ('P31')        #link table outdated?
validateproperty = ('P646')  #link table outdated?
maxreportlength = 488
minvalue = 0
maxvalue = 0
itemlist = ''
ignoreit = ('Q4617','Q1362169','Q487612','Q4115189','Q13406268','Q15397819','Q302')

def makeReport(db, pproperty):

    if pproperty == 'P31':
        query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM wb_items_per_site, wb_entity_per_page WHERE epp_page_id NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("P31","P279") AND pl_namespace = 120) AND epp_entity_id = ips_item_id GROUP BY epp_page_id HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 900'

    if pproperty == 'P646':  #historic, slow/to be deactivated after initial run
        query1 = ''

    elif pproperty == 'P21':  # sex/gender
        query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P31" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "P21" AND pl_namespace = 120) AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "Q13406463" AND pl_namespace = 0) AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "Q4167836" AND pl_namespace = 0) AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "Q41710" AND pl_namespace = 0) AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "Q225469" AND pl_namespace = 0) AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "Q8047" AND pl_namespace = 0) AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "P360" AND pl_namespace = 120) AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "Q28640" AND pl_namespace = 0) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >2 ORDER BY COUNT(ips_site_id) DESC LIMIT 800'

    elif pproperty == 'P22':   # family
        query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P21" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("P22","P40","P7","P25","P9","P53","P1038","P451") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 650' 
    elif pproperty == 'P106':  # activities
        query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P21" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("P106","P39","P97") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 650'
    elif pproperty == 'P551':   # locations
        query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P21" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("P19","P20","P551","P119","P937") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 650' 
    elif pproperty == 'P569':   # dates
        query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P21" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title In ("P569","P570","P1317") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 600'
    elif pproperty == 'P735':  # first name
        query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P21" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "P735" AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 1000'
    elif pproperty == 'P1412':   # languages 
        query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P21" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "P1412" AND pl_namespace = 120) AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "P103" AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 600'
    elif pproperty == 'P119': #with P1442 only
        query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P1442" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "P119" AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 600'
    elif pproperty == 'P26': #with P570 only
        query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P570" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "P26" AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 600'
    elif pproperty == 'P1196':
        query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P509" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("P1196","P1347") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 600'
    if pproperty == 'P570':
        query1 = 'SELECT pp_value, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM page, wb_entity_per_page, wb_items_per_site, enwiki_p.categorylinks JOIN enwiki_p.page_props ON pp_page=cl_from WHERE ( (cl_to LIKE "1%" AND cl_to RLIKE "1[0-9]{3}_deaths") OR (cl_to LIKE "20%" AND cl_to RLIKE "20(0[0-9]|1[0-4])_deaths") ) AND pp_propname="wikibase_item" AND cl_type="page" AND pp_value NOT IN (SELECT page_title FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("P570","P746") AND pl_namespace = 120) AND pp_value = page_title AND epp_entity_id = ips_item_id AND epp_entity_type = "item" AND page_id = epp_page_id AND page_namespace = 0 GROUP BY pp_value HAVING COUNT(ips_site_id) >2 ORDER BY COUNT(ips_site_id) DESC LIMIT 800'

    elif pproperty == 'P214':   # people
        query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P21" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("P214","P54","P641","P345","P434","P1047","P1186","P102") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 700' 
    elif pproperty == 'P345':   # films
        query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=0 AND pl1.pl_title="Q11424" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("P345","P971","P360","P301","P279") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >1 ORDER BY COUNT(ips_site_id) DESC LIMIT 850'    

    elif pproperty in validateartwork:
        query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=0 AND pl1.pl_title="Q3305213" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("'+pproperty+'","P279") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >1 ORDER BY COUNT(ips_site_id) DESC LIMIT 600'
    elif pproperty in validateQ16521:
        query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P225" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("'+pproperty+'") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >1 ORDER BY COUNT(ips_site_id) DESC LIMIT 550'
    elif pproperty in validateQ11424:
        query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=0 AND pl1.pl_title="Q11424" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("'+pproperty+'","P279") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >1 ORDER BY COUNT(ips_site_id) DESC LIMIT 850'
    cursor = db.cursor()
    cursor.execute(query1)
    text = ''
    table_row = '{{{{TR missing P|{0}|{1}|{2}|{3}}}}}\n'
    reportlength = 0
    skipped = 0
    for q, cnt, sites in cursor:
        skipped +=1
        data = json.load(urllib2.urlopen('https://www.wikidata.org/w/api.php?action=wbgetentities&ids=%s&props=labels|claims&format=json' % q))
        if pproperty in validateQ5:        
            if 'claims' in data['entities'][q]:
                if 'P279' in data['entities'][q]['claims']:
                    validity = 0
                    continue
                if 'P31' in data['entities'][q]['claims']:
                    validity = 0
                    for m in data['entities'][q]['claims']['P31']:
                        if m['mainsnak']['datavalue']['value']['numeric-id'] == 5:
                            validity = 1
                    if validity == 0:
                        continue
        elif pproperty in validateQ11424:        
            if 'claims' in data['entities'][q]:
                if 'P31' in data['entities'][q]['claims']:
                    validity = 0
                    for m in data['entities'][q]['claims']['P31']:
                        if m['mainsnak']['datavalue']['value']['numeric-id'] == 11424:
                            validity = 1
                    if validity == 0:
                        continue
        elif pproperty in validateartwork:        
            if 'claims' in data['entities'][q]:
                if 'P31' in data['entities'][q]['claims']:
                    validity = 0
                    for m in data['entities'][q]['claims']['P31']:
                        if m['mainsnak']['datavalue']['value']['numeric-id'] == 3305213:
                            validity = 1
                    if validity == 0:
                        continue

        elif pproperty in validateQ16521:        
            if 'claims' in data['entities'][q]:
                if 'P31' in data['entities'][q]['claims']:
                    validity = 0
                    for m in data['entities'][q]['claims']['P31']:
                        if m['mainsnak']['datavalue']['value']['numeric-id'] == 16521:
                            validity = 1
                    if validity == 0:
                        continue
        elif pproperty in validateP31:        
            validity = 1
            if 'claims' in data['entities'][q]:
                if 'P31' in data['entities'][q]['claims']:
                    validity = 0
                if 'P279' in data['entities'][q]['claims']:
                    validity = 0
        elif pproperty in validateproperty:        
            validity = 1
            if 'claims' in data['entities'][q]:
                if pproperty in data['entities'][q]['claims']:
                    validity = 0
        else:
            validity = 1
        if (q in ignoreit) == True:
            validity = 0
        if validity == 1:
            label = q
            if 'labels' in data['entities'][q]:
                if 'en' in data['entities'][q]['labels']:
                    label = data['entities'][q]['labels']['en']['value']
                elif (sites.split(",")[0]) in data['entities'][q]['labels']:
                    label = data['entities'][q]['labels'][(sites.split(",")[0])]['value']
            if (((' ' in label) == True) or (pproperty != 'P735')):
                if (",fr," in sites) == True: sites= "fr," + sites.replace(",fr,", ",", 1)
                if (",commons," in sites) == True: sites= "commons," + sites.replace(",commons,", ",", 1)
                if (",en," in sites) == True: sites= "en," + sites.replace(",en,", ",", 1)
                sites = (sites[:68] + '..') if len(sites) > 68 else sites
                text += table_row.format(q,label.encode('utf-8'),cnt,sites)
                reportlength +=1
                skipped -=1
                if reportlength == 1: 
                    maxvalue = cnt
                    itemlist = q.replace("Q", "", 1)
                else:
                    itemlist = itemlist + q.replace("Q", ",", 1)
        if reportlength == maxreportlength:
            break
    minvalue = cnt
    return text, maxvalue, minvalue, reportlength, skipped, itemlist


def wikisite(sit):
    if sit in ('commonswiki', 'specieswiki', 'metawiki'):
        group = 'wikimedia'
        sdomain = sit.split('wiki')[0]
    elif sit in ('wikidatawiki', 'mediawikiwiki'):
        sdomain = 'www'
        group = sit.split('wiki')[0]
    else:
        sdomain, group = sit.split('wik')
        sdomain = sdomain.replace('_', '-')
        if sdomain == 'nb':
            sdomain = 'no'
        group = 'wik'+group
        if group == 'wiki':
            group = 'wikipedia'
    return sdomain, group


def makeNoClaimReport(db, sit, maxrlength, table_row, mincount, numberofclaims, reportmode):
    # FIXME: if numberofclaims>0 limit to pages in namespace 0 (article)
    if reportmode == "cat":
        hiddencat = ""
        if sit in ("frwiki"):
            #condition to be inserted in query1
            hiddencat = ' cl_to NOT IN ( SELECT page_title FROM ' + sit + '_p.page, ' + sit + '_p.page_props WHERE page_namespace = 14 AND pp_page = page_id  AND pp_propname = "hiddencat" ) AND '
        query1 = 'SELECT c.cl_to As cat, c.cnt As cnt, d.pp_value As QID FROM (SELECT cl_to, COUNT(*) AS cnt FROM ' + sit + '_p.categorylinks as cl, ' + sit + '_p.page_props As wpage_props WHERE ' + hiddencat + ' cl_from = wpage_props.pp_page AND wpage_props.pp_propname = "wikibase_item" AND wpage_props.pp_page = cl.cl_from AND cl_type = "page" AND wpage_props.pp_value IN ( SELECT page_title FROM page, page_props, wb_entity_per_page, wb_items_per_site WHERE ips_site_id = "' + sit + '" AND pp_page = epp_page_id AND pp_propname = "wb-claims" AND pp_value = "'+str(numberofclaims)+'" AND epp_entity_id = ips_item_id AND epp_entity_type = "item" AND page_id = pp_page) GROUP BY cl_to HAVING COUNT(*)>' + str(mincount) + ' ORDER BY COUNT(*) DESC) as c LEFT JOIN (SELECT page_title, pp_value FROM ' + sit + '_p.page, ' + sit + '_p.page_props WHERE page_namespace = 14 AND pp_page = page_id AND pp_propname = "wikibase_item" ) as d ON c.cl_to = d.page_title'
    else:
        query1 = 'SELECT REPLACE(tl_title, "_", " ") As tmpl, COUNT(*) As cnt, "" as QID FROM ' + sit + '_p.templatelinks, ' + sit + '_p.page_props As wpage_props WHERE tl_from = wpage_props.pp_page AND wpage_props.pp_propname = "wikibase_item" AND wpage_props.pp_page = tl_from AND wpage_props.pp_value IN ( SELECT page_title FROM page, page_props, wb_entity_per_page, wb_items_per_site WHERE ips_site_id = "' + sit + '" AND pp_page = epp_page_id AND pp_propname = "wb-claims" AND pp_sortkey = '+str(numberofclaims)+' AND epp_entity_id = ips_item_id AND epp_entity_type = "item" AND page_id = pp_page) GROUP BY tl_title HAVING COUNT(tl_title)>' + str(mincount) + ' ORDER BY COUNT(tl_title) DESC'
    reportlength = 0
    cursor = db.cursor()
    cursor.execute(query1)
    text = ''
    for cat, cnt, qid in cursor:
        lang, group = wikisite(sit)
        tresult = 1
        if sit == "enwiki":
            ca = ("Wikipedia_", "Articles_", "All_", "Use_", "Orphaned_", "CS1_", "Pages_", "Cleanup_tagged", "EngvarB_from", "Uncategorized_from")
            for c in ca:
                if cat.startswith(c):
                    tresult = 0
        elif sit == "itwiki":
            p = re.match('Senza_fonti_-_.*_\d\d\d\d',cat)
            if p:
                tresult = 0
        elif sit == "plwiki":
            p = re.match('Artyku.y_wymagaj.ce_uzupe.nienia_.r.de._od_\d\d\d\d.*',cat)
            if p:
                tresult = 0
        elif sit == "jawiki":
            p = re.match(u'出典を必要とする記事.*',cat)
            if p:
                tresult = 0
        elif sit == "ptwiki":
            p = re.match('!Artigos_que_carecem_de_fontes_desde.*',cat)
            if p:
                tresult = 0
            p = re.match('!Artigos_sem_interwiki_desde_.*',cat)
            if p:
                tresult = 0
        if tresult == 1:
            reportlength +=1
            text += table_row.format(cat.replace("_"," "), cnt, lang, group, qid, reportlength)
        if reportlength == maxrlength:
            break
    return text, reportlength


def main():
    db = MySQLdb.connect(host="wikidatawiki.labsdb",db="wikidatawiki_p", read_default_file="~/replica.my.cnf")

    noclaims = ("cywiki", "iswiki", "ocwiki", "ltwiki", "nowiki", "ptwiki", "frwiki", "svwiki", "rowiki", "lvwiki", "nlwiki", "ruwiki", "ukwiki", "plwiki", "cswiki", "skwiki", "enwiki", "dewiki", "itwiki", "zhwiki", "eswiki", "fiwiki", "huwiki", "trwiki", "specieswiki", "dawiki", "eowiki", "etwiki", "cawiki", "nnwiki", "jawiki")
    for sit in noclaims:
        table_row = '{{{{TR noclaims site|{0}|{1}|{2}|{3}|{4}|{5}}}}}\n'
        maxrlength = 2000
        mincount = 50
        numberofclaims = 0
        reportname = "Wikidata:Database reports/items without claims categories/" 
        if (sit in ("cywiki", "iswiki", "ocwiki", "ltwiki", "rowiki", "lvwiki", "nlwiki", "cswiki", "skwiki", "specieswiki", "huwiki", "cawiki", "nnwiki", "eowiki")): mincount = 10
        if (sit in ("enwiki")): maxrlength = 3500
        report, reportlength = makeNoClaimReport(db,sit,maxrlength,table_row,mincount,numberofclaims, "cat")
        page = pywikibot.Page(site, reportname + sit)
        stat = '{{DR noclaims|site=%s|reportlength=%s|mincount=%s}}\n' %(sit, reportlength, mincount)
        header = '{{{{' + reportname +'header|{0}}}}}<onlyinclude>\n'
        footer = '</onlyinclude>{{' + reportname + 'footer}} __NOINDEX__'
        text = stat + header.format(time.strftime("%Y-%m-%d %H:%M (%Z)")) + report + footer
        summary = 'Bot: Updating database report: site %s; reportlength: %s; mincount: %s' %(sit, reportlength, mincount)
        page.put(text.decode('UTF-8'),comment=summary,minorEdit=False)


    noclaims = ("cywiki", "iswiki", "ocwiki", "ltwiki", "nowiki", "itwiki")
    for sit in noclaims:
        table_row = '{{{{TR noclaims site|{0}|{1}|{2}|{3}|{4}|{5}}}}}\n'
        maxrlength = 2000
        mincount = 50
        numberofclaims = 1
        reportname = "Wikidata:Database reports/items with " + str(numberofclaims) + " statement categories/" 
        if (sit in ("cywiki", "iswiki", "ocwiki", "ltwiki", "rowiki", "lvwiki", "nlwiki", "cswiki", "skwiki")): mincount = 10
        if (sit in ("frwiki")): mincount = 30
        if (sit in ("enwiki")): maxrlength = 3500
        report, reportlength = makeNoClaimReport(db,sit,maxrlength,table_row,mincount,numberofclaims, "cat")
        page = pywikibot.Page(site, reportname + sit)
        stat = '{{DR noclaims|site=%s|reportlength=%s|mincount=%s}}\n' %(sit, reportlength, mincount)
        header = '{{{{' + reportname + 'header|{0}}}}}<onlyinclude>\n'
        footer = '</onlyinclude>{{' + reportname + 'footer}} __NOINDEX__'
        text = stat + header.format(time.strftime("%Y-%m-%d %H:%M (%Z)")) + report + footer
        summary = 'Bot: Updating database report: site %s; reportlength: %s; mincount: %s' %(sit, reportlength, mincount)
        page.put(text.decode('UTF-8'),comment=summary,minorEdit=False)

    noclaimstemplate = ("rowiki", "lvwiki", "nlwiki", "ruwiki", "ukwiki", "cswiki", "skwiki", "dewiki", "svwiki", "enwiki", "frwiki", "ptwiki", "zhwiki", "jawiki", "cawiki")
    # noclaimstemplate = ("cywiki", "iswiki", "ltwiki", "ptwiki", "rowiki", "lvwiki", "nlwiki", "ruwiki", "ukwiki", "cswiki", "skwiki", "dewiki")
    for sit in noclaimstemplate:
        table_row = '{{{{TR noclaimstemplate site|{0}|{1}|{2}|{3}}}}}\n'
        maxrlength = 2000
        mincount = 50
        numberofclaims = 0
        reportname = "Wikidata:Database reports/templates and items with 0 claims/" 
        if (sit in ("cywiki", "iswiki", "ocwiki", "ltwiki", "rowiki", "lvwiki", "svwiki", "nlwiki", "jawiki")): mincount = 10
        report, reportlength = makeNoClaimReport(db, sit, maxrlength, table_row, mincount, numberofclaims, "template")
        page = pywikibot.Page(site, reportname + sit)
        stat = '{{DR noclaims|site=%s|reportlength=%s|mincount=%s|numberofclaims=%s}}\n' %(sit, reportlength, mincount, numberofclaims)
        header = '{{{{'+reportname+'header|{0}}}}}<onlyinclude>\n'
        footer = '</onlyinclude>{{'+reportname+'footer}} __NOINDEX__'
        text = stat + header.format(time.strftime("%Y-%m-%d %H:%M (%Z)")) + report + footer
        summary = 'Bot: Updating database report: site %s; reportlength: %s; mincount: %s; numberofclaims: %s' %(sit, reportlength, mincount, numberofclaims)
        page.put(text.decode('UTF-8'),comment=summary,minorEdit=False)

    noclaimstemplate = ("frwiki", "ptwiki", "zhwiki")
    for sit in noclaimstemplate:
        table_row = '{{{{TR 1 claimstemplate site|{0}|{1}|{2}|{3}}}}}\n'
        maxrlength = 2000
        mincount = 50
        numberofclaims = 1
        reportname = "Wikidata:Database reports/templates and items with 1 claim/" 
        if (sit in ("cywiki", "iswiki", "ocwiki", "ltwiki", "rowiki", "lvwiki", "svwiki", "nlwiki")): mincount = 10
        report, reportlength = makeNoClaimReport(db, sit, maxrlength, table_row, mincount, numberofclaims, "template")
        page = pywikibot.Page(site, reportname + sit)
        stat = '{{DR noclaims|site=%s|reportlength=%s|mincount=%s|numberofclaims=%s}}\n' %(sit, reportlength, mincount, numberofclaims)
        header = '{{{{'+reportname+'header|{0}}}}}<onlyinclude>\n'
        footer = '</onlyinclude>{{'+reportname+'footer}} __NOINDEX__'
        text = stat + header.format(time.strftime("%Y-%m-%d %H:%M (%Z)")) + report + footer
        summary = 'Bot: Updating database report: site %s; reportlength: %s; mincount: %s; numberofclaims: %s' %(sit, reportlength, mincount, numberofclaims)
        page.put(text.decode('UTF-8'),comment=summary,minorEdit=False)

    for pproperty in properties:
        report, maxvalue, minvalue, reportlength, skipped, itemlist = makeReport(db,pproperty)
        stat = '{{DR missing properties|property=%s|max=%s|min=%s|reportlength=%s|skipped=%s}}\n' %(pproperty, maxvalue, minvalue, reportlength, skipped)
        page = pywikibot.Page(site,'Wikidata:Database reports/top missing properties by number of sitelinks/'+pproperty)
        header = '{{{{Wikidata:Database reports/top missing properties by number of sitelinks/header|{0}|itemlist='+ itemlist +'}}}}<onlyinclude>\n'
        footer = '</onlyinclude>{{Wikidata:Database reports/top missing properties by number of sitelinks/footer}} __NOINDEX__'
        text = stat + header.format(time.strftime("%Y-%m-%d %H:%M (%Z)")) + report + footer
        summary = 'Bot: Updating database report: property %s; max: %s; min: %s; reportlength: %s; skipped: %s' %(pproperty, maxvalue, minvalue, reportlength, skipped)
        page.put(text.decode('UTF-8'),comment=summary,minorEdit=False)


if __name__ == "__main__":
    main()