User:ANU Outreachy/Outreachy 3

From Wikidata
Jump to navigation Jump to search
import pywikibot
import re
import requests
from bs4 import BeautifulSoup 

# Site handle for the English-language Wikipedia; the script below uses it to load article pages.
enwiki = pywikibot.Site('en', 'wikipedia')
# Data repository associated with that site (not referenced again in the visible code).
enwiki_repo = enwiki.data_repository()

def findInception(page):
    """Print the inception year (P571) read from a "<year> software" category.

    Searches ``page.text`` for a wikitext category link such as
    ``[[Category:2008 software]]`` and prints the first one found as
    ``P571: <year> software``. When the page carries no such category,
    ``P571: Not Found`` is printed instead of raising ``IndexError``.

    Args:
        page: object exposing the article wikitext as ``page.text``.
    """
    # (?im) = case-insensitive + multiline; the only capture group is the year.
    match = re.search(r'(?im)\[\[\s*Category\s*:\s*(\d+) software\s*[\|\]]', page.text)
    if match is None:
        print('P571: Not Found')
        return
    print('P571: ' + match.group(1) + ' software')

def findInstanceOf(page):
    """Print the instance-of value (P31) read from a "<word> operating systems" category.

    Searches ``page.text`` for a wikitext category link such as
    ``[[Category:Unix operating systems]]`` and prints the first one found as
    ``P31: <word> operating systems``. When the page carries no such
    category, ``P31: Not Found`` is printed instead of raising ``IndexError``.

    Args:
        page: object exposing the article wikitext as ``page.text``.
    """
    # (?im) = case-insensitive + multiline; the capture group is the single
    # word (\w+) that precedes " operating systems".
    match = re.search(r'(?im)\[\[\s*Category\s*:\s*(\w+) operating systems\s*[\|\]]', page.text)
    if match is None:
        print('P31: Not Found')
        return
    print('P31: ' + match.group(1) + ' operating systems')
	

def findprop(propertyname, propertyid, page):
    """Print an infobox field's value labelled with a Wikidata property id.

    Looks in ``page.text`` for the first line of the form
    ``| <propertyname> = <value>`` (at least one whitespace character is
    required before the ``=``) and prints ``<propertyid>: <value>``.

    Only the text between the first and second ``=`` of that line is
    considered, and within it only the part after the last ``[[``; the last
    token the value regex finds there is what gets printed.

    Nothing is printed when the field is absent. When the field is present
    but no value can be extracted, ``<propertyid>: Not Found`` is printed
    instead of raising ``IndexError``.

    Args:
        propertyname: infobox field name, matched literally.
        propertyid: Wikidata property id used as the output label, e.g. ``'P18'``.
        page: object exposing the article wikitext as ``page.text``.
    """
    # re.escape keeps a field name containing regex metacharacters literal.
    field_pattern = r'\|\s*%s\s+\=\s*.+' % re.escape(propertyname)
    found_items = re.findall(field_pattern, page.text)
    if not found_items:
        return

    # Text between the first and second '=' of the matched line.
    raw_value = found_items[0].split('=')[1]
    # Only the segment after the last '[[' contributes to the output.
    last_segment = raw_value.split('[[')[-1].strip()
    items = re.findall(r'[\w\s]+[^\],][\w\s\.\|\)\-\,]+', last_segment)
    if not items:
        # The value regex needs at least three suitable characters to match.
        print(propertyid + ": Not Found")
        return
    print(propertyid + ": " + items[-1])

		

# --- Chromium: values scraped from the English Wikipedia article ---
page = pywikibot.Page(enwiki, "Chromium_(web_browser)")
print(page)
findInception(page)  # inception year from the "<year> software" category
# (infobox field, Wikidata property id) pairs, looked up in this order
for infobox_field, property_id in (
    ('screenshot', 'P18'),  # image link
    ('logo', 'P154'),       # logo image link
    ('author', 'P178'),     # developed by
    ('author', 'P112'),     # founded by
    ('name', 'P138'),       # named after
):
    findprop(infobox_field, property_id, page)

print("\n")

# --- DragonFly BSD: values scraped from the English Wikipedia article ---
page2 = pywikibot.Page(enwiki, "DragonFly_BSD")
print(page2)
findInception(page2)   # inception year from the "<year> software" category
findInstanceOf(page2)  # instance-of value from the "<word> operating systems" category
for infobox_field, property_id in (
    ('website', 'P856'),    # website link
    ('logo', 'P18'),        # logo image link
    ('developer', 'P178'),  # developer
    ('developer', 'P112'),  # founded by
):
    findprop(infobox_field, property_id, page2)



def findPropertyFromWikidata(item, propertyid):
    """Print every value an item holds for one Wikidata property.

    Calls ``item.get()`` and reads ``["claims"][propertyid]`` from the result.
    For each claim it prints ``<propertyid>: `` and then either the English
    label of the claim's target (when the target can be loaded with ``get()``
    and has ``['labels']['en']``) or the target object itself, followed by a
    blank separator.

    When the item has no claims for ``propertyid``, ``<propertyid>: Not Found``
    is printed. Errors raised while reading a claim's target are no longer
    misreported as "Not Found"; they propagate to the caller.

    Args:
        item: object whose ``get()`` returns a dict with a ``"claims"`` mapping
            from property id to a list of claims.
        propertyid: Wikidata property id to look up, e.g. ``'P571'``.
    """
    item_dict = item.get()  # item content in dictionary form
    try:
        claims = item_dict["claims"][propertyid]
    except KeyError:
        # Only a missing key means the property is absent.
        print(propertyid + ": Not Found\n")
        return

    for claim in claims:
        target = claim.getTarget()  # the claim's value; its type depends on the property
        print(propertyid + ": ")
        try:
            label = target.get()['labels']['en']
        except Exception:
            # Targets without get() or without an English label are printed as-is.
            # Exception (not a bare except) lets SystemExit/KeyboardInterrupt through.
            print(target)
        else:
            print(label)
        print("\n")


		

"""BONUS TASK"""

print("\n\nPrinting the same properties directly from wikidata\n")

site = pywikibot.Site("wikidata", "wikidata")
repo = site.data_repository()

# --- First item (Q48524) ---
item = pywikibot.ItemPage(repo, "Q48524")
item_dict = item.get()  # item content in dictionary form
# NOTE(review): the label is read under the 'uk' key while
# findPropertyFromWikidata reads 'en' -- confirm which language is intended.
print(item_dict['labels']['uk'])
for wikidata_property in ("P571", "P18", "P154", "P178", "P112", "P138"):
    findPropertyFromWikidata(item, wikidata_property)


# --- Second item (Q751067) ---
item2 = pywikibot.ItemPage(repo, "Q751067")
item2_dict = item2.get()  # item content in dictionary form
print(item2_dict['labels']['uk'])
for wikidata_property in ("P571", "P31", "P856", "P18", "P178", "P112"):
    findPropertyFromWikidata(item2, wikidata_property)


What I Learnt

1. Learnt to write regular expressions (RE) in Python.

2. Learnt to use the search(), findall(), compile(), split(), sub(), subn() and escape() functions of the 're' module.

3. Learnt to build a dictionary from page information and to access that information.


My Observations

1. To find values from the "Categories" section, I needed to change the code a little each time, depending on the way the values were mentioned in a page.

2. Finding values from the infobox was comparatively easy and works well for most pages if the property exists. Still, one has to check which field titles the infobox uses.