# User:ANU Outreachy/Outreachy 5
# (exported from a Wikidata user page; page-navigation chrome removed so the file parses as Python)
# Import modules
import pywikibot
from pywikibot import pagegenerators
from pywikibot.data import api
import numpy as np
import requests
from importlib import reload
import re

site = pywikibot.Site("wikidata", "wikidata")
repo = site.data_repository()
enwp_site = 'enwiki'
prefix = 'en'

def search_entities(site, itemtitle):
    params = {'action': 'wbsearchentities',
              'format': 'json',
              'language': 'en',
              'type': 'item',
              'search': itemtitle}
    request = api.Request(site=site, parameters=params)
    return request.submit()


def findCorrectQID(wikidataEntries, text):
    if wikidataEntries['search'] != []:
        results = wikidataEntries['search']
        numresults = len(results)
        for i in range(0, numresults):
            qid = results[i]['id']		
            label = results[i]['label']
            curr_url = results[i]['title']
            desc = results[i]['description']
            match  = results[i]['match']
            
            if match['language'] == 'en' and match['type'] == 'label' and match['text'] == text and desc != 'Wikimedia disambiguation page' :		#checking if language is english, type is label and text is exactly same as the searched text
                print(qid + "- " + label + "\nDescription: " + str(desc) +"\n")		#printing QID, label and Description for the term searched for in Wikidata
          


list_of_text = ["BSD", "free operating system", 'software', "Matthew Dillon", "FreeBSD", "x86_64", "FreeBSD ports"]

list_of_text2 = ["free software", "Microsoft Windows"]
for text in list_of_text:
	wikidataEntries = search_entities(repo, text)	
	findCorrectQID(wikidataEntries, text)


"""For Unconnected pages"""
#Following https://bitbucket.org/mikepeel/wikicode/src/master/enwp_find_wikidata.py


def unconnected_pages(query):
    enwp = pywikibot.Site('en', 'wikipedia')
    enwd = pywikibot.Site('wikidata', 'wikidata')
    targetcats = ['Category:Articles_without_Wikidata_item']
    for targetcat in targetcats:
        cat = pywikibot.Category(enwp, targetcat)
	# pages = pagegenerators.CategorizedPageGenerator(cat, recurse=False)
        pages = enwp.querypage('UnconnectedPages')
        for page in pages:
            if query == page.title():
                print("\n" + "http://en.wikipedia.org/wiki/"+page.title().replace(' ','_'))
                if 'Articles for deletion' in page.title():
                    continue
                if page.isRedirectPage():
                    continue
                try:
                    wd_item = pywikibot.ItemPage.fromPage(page)
                    item_dict = wd_item.get()
                    qid = wd_item.title()
                    print("Has a sitelink already - " + qid)
                    continue
                except:
                    # If that didn't work, go no further
                    print(page.title() + ' - no page found')
                    wd_item = 0
                    item_dict = 0
                    qid = 0
                    sitelink_check = 0
                    # continue

		# If we're here, then we don't have one, see if we can add it through the commons category
                searchtag = page.title()
                try:
                    searchtag = searchtag.split('(')[0].strip()
                except:
                    null = 0
                wikidataEntries = search_entities(repo, searchtag)
		# print(wikidataEntries)
                data = {'sitelinks': [{'site': enwp_site, 'title': page.title()}]}
		# print(wikidataEntries['searchinfo'])
                done = 0
                if wikidataEntries['search'] != []:
                    results = wikidataEntries['search']
		    # prettyPrint(results)
                    numresults = len(results)
                    if numresults > 5:
                        print('More than 5 candidates, bot would skip')
                    for i in range(0,numresults):
                        if done != 0:
                            continue
                        targetpage = pywikibot.ItemPage(site, results[i]['id'])
                        try:
                            item_dict = targetpage.get()
                        except:
                            continue
				# print(item_dict)
                        sitelink = ''
                        try:
                            sitelink = get_sitelink_title(item_dict['sitelinks'][enwp_site])
                        except:
                            null = 0
                        if sitelink == '':
                            print('http://www.wikidata.org/wiki/'+results[i]['id'])
                            try:
                                 print(item_dict['labels']['en'])
                            except:
                                 print('')
                            try:
                                 print(item_dict['descriptions']['en'])
                            except:
                                 print('')
                            print('http://'+prefix+'.wikipedia.org/wiki/' + page.title().replace(' ','_'))
                            text = input("Save? ")
                            if text != 'n':
                                targetpage.editEntity(data, summary=u'Add enwp sitelink')
                                done = 1



unconnected_pages('Anna Palmer')	#Search for "Anna Palmer" in Unconnected Pages - will find sitelink for this
unconnected_pages("Central Council for Research in Homoeopathy (CCRH)")		#Search for "Central Council for Research in Homoeopathy (CCRH)" in Unconnected Pages - will not find in page, have an option to save if link is found