User:Mbear/UpdatesNeededProgram
Requirements
I don't remember exactly; I wrote these two programs several years ago. I know you need Python, the BeautifulSoup module, and (looking at the imports) the requests module as well. I've only run them via Python 2.7, but they might work under Python 3.
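If you're setting up from scratch, something like this should pull both modules in (the package names are my best guess, and on Python 2.7 you may need an older BeautifulSoup 4 release):

pip install requests beautifulsoup4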
Updates Needed Builder
The following Python 2.7 program accesses the Updates Needed page, gets a list of links, then examines each of those pages to find the content of the "Update Needed" tag. That information is saved as a row in an HTML table. It does NOT include characters, because I don't do biography pages. (See the next heading.)
import re
import requests
from bs4 import BeautifulSoup

HTMLOpen = """<!DOCTYPE html>
<html>
<head>
<title>Updates Needed List</title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<style>
body{font-family:"Arial";font-size:8pt;}
table,tr,th,td{border:1px solid #000;border-collapse:collapse;padding:1px;}
span.hardcopy {background-color:#5ddfff;}
span.epub{background-color:#ffc0cb;}
</style>
</head>
<body>
<table>
<thead>
<tr>
<th>Page</th>
<th>Sources</th>
</tr>
</thead>
"""
HTMLLineStart = "<tr><td>"
HTMLLineEnd = "</td></tr>"
HTMLClose = """
</table>
</body>
</html>"""

# Matches the rendered Update Needed template; group 2 holds the list of source links.
rawstr = """(<b>Update Needed</b><br /><div style="font-size: 90%; text-align: center">This article needs to be updated with material from <i>)(.+)(</i>. Once this title clears the <a href="/wiki/Policy:Moratorium" title="Policy:Moratorium">Moratorium period</a>, or if it already has, please consider revisiting this article and updating it with the new material, removing this tag once all information has been added.)"""

UpdateNeededDict = {}
# splitlines() is needed to get each item from the list without trailing newline characters.
epublist = open('epublist-bare.txt','r').read().splitlines()
booklist = open('booklist.txt','r').read().splitlines()

UpdateURL = 'http://www.sarna.net/wiki/index.php?title=Special:WhatLinksHere/Template:Update_Needed&limit=1500'
wikibase = "http://www.sarna.net"
headers = {'User-Agent': 'Mbear\'s Update Needed Builder',
           'From': 'pae@towerofjade.com'}

r = requests.get(UpdateURL, headers=headers)
text = r.text
soup = BeautifulSoup(text)
linklist = soup.select("ul#mw-whatlinkshere-list > li > a[href]")

UpdateList = open('UpdatesNeeded.html','w')
UpdateList.write(HTMLOpen)

characterCount = 0
try:
    for link in linklist:
        title = link.attrs['title']
        if ":" in title:
            # Titles with a colon are system pages (Template:, Policy:, etc.).
            print "examining {0}\tSKIPPING FILE - System file".format(title)
        else:
            print "examining {0}".format(title.encode('utf8',errors='ignore')),  # trailing comma: stay on this line
            workinglink = wikibase + link.attrs['href']
            req = requests.get(workinglink)
            workingtext = req.text
            souptext = BeautifulSoup(workingtext)
            # Biography pages sit in a "Characters" category; skip those here.
            categorylist = souptext.select("div#mw-normal-catlinks > ul > li > a[title*=Characters]")
            if len(categorylist) == 0:
                compile_obj = re.compile(rawstr)
                match_obj = compile_obj.search(workingtext)
                newstr = BeautifulSoup(match_obj.group(2))
                sl = []
                sourcelist = newstr.select("a[title]")
                for source in sourcelist:
                    source = source.attrs['title']
                    # Highlight epubs and hardcopy books using the styles defined in HTMLOpen.
                    if source.lower() in epublist:
                        source = "<span class='epub'>" + source + "</span>"
                    elif source.lower() in booklist:
                        source = "<span class='hardcopy'>" + source + "</span>"
                    sl.append(source.encode('utf8'))
                UpdateNeededDict[title] = sl
                print "\t({0} of {1})".format(len(UpdateNeededDict),len(linklist))
            else:
                characterCount = characterCount + 1
                print "\tSKIPPING FILE - Character: {0}".format(characterCount)
except:  # typically a KeyboardInterrupt; fall through and save whatever was collected
    pass

# Write one table row per page (this runs after a clean finish or an interrupt).
for k,v in sorted(UpdateNeededDict.items()):
    osl = ", ".join(v)
    writethis = HTMLLineStart + unicode(k).encode('utf8',errors='ignore') + '</td><td>' + osl + HTMLLineEnd + '\r'
    UpdateList.write(writethis)
UpdateList.write("Character Count: {0}".format(characterCount))
UpdateList.write(HTMLClose)
UpdateList.close()
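If I'm reading my own code right, epublist-bare.txt and booklist.txt are plain lists with one title per line, and since the script compares source.lower() against them, the entries in those files need to be lowercase. A finished row in UpdatesNeeded.html looks roughly like this (the page and source names here are made up):

<tr><td>Some Article</td><td><span class='epub'>Example Epub Title</span>, <span class='hardcopy'>Example Sourcebook</span>, Example Other Source</td></tr>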
Updates Needed Builder People
This program builds the complementary list: all character/people pages that need information added.
import re
import requests
from bs4 import BeautifulSoup

HTMLOpen = """<!DOCTYPE html>
<html>
<head>
<title>Characters Updates Needed List</title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<style>
body{font-family:"Arial";font-size:8pt;}
table,tr,th,td{border:1px solid #000;border-collapse:collapse;padding:1px;}
span.hardcopy {background-color:#5ddfff;}
span.epub{background-color:#ffc0cb;}
</style>
</head>
<body>
<table>
<thead>
<tr>
<th>Page</th>
<th>Sources</th>
</tr>
</thead>
"""
HTMLLineStart = "<tr><td>"
HTMLLineEnd = "</td></tr>"
HTMLClose = """
</table>
</body>
</html>"""

# Matches the rendered Update Needed template; group 2 holds the list of source links.
rawstr = """(<b>Update Needed</b><br /><div style="font-size: 90%; text-align: center">This article needs to be updated with material from <i>)(.+)(</i>. Once this title clears the <a href="/wiki/Policy:Moratorium" title="Policy:Moratorium">Moratorium period</a>, or if it already has, please consider revisiting this article and updating it with the new material, removing this tag once all information has been added.)"""

UpdateNeededDict = {}
# splitlines() is needed to get each item from the list without trailing newline characters.
epublist = open('epublist-bare.txt','r').read().splitlines()
booklist = open('booklist.txt','r').read().splitlines()

UpdateURL = 'http://www.sarna.net/wiki/index.php?title=Special:WhatLinksHere/Template:Update_Needed&limit=1500'
wikibase = "http://www.sarna.net"
headers = {'User-Agent': 'Mbear\'s Update Needed Builder',
           'From': 'pae@towerofjade.com'}

r = requests.get(UpdateURL, headers=headers)
text = r.text
soup = BeautifulSoup(text)
linklist = soup.select("ul#mw-whatlinkshere-list > li > a[href]")

UpdateList = open('CharactersUpdateNeeded.html','w')
UpdateList.write(HTMLOpen)

counter = 0
try:
    for link in linklist:
        title = link.attrs['title']
        if ":" in title:
            # Titles with a colon are system pages (Template:, Policy:, etc.).
            print "examining {0}\tSKIPPING FILE - System file".format(title)
        else:
            print "examining {0}".format(title.encode('utf8',errors='ignore')),  # trailing comma: stay on this line
            workinglink = wikibase + link.attrs['href']
            req = requests.get(workinglink)
            workingtext = req.text
            souptext = BeautifulSoup(workingtext)
            # This time keep only biography pages: anything in a "Characters" category.
            categorylist = souptext.select("div#mw-normal-catlinks > ul > li > a[title*=Characters]")
            if len(categorylist) == 0:
                counter = counter + 1
                print "\t({0} of {1})".format(counter,len(linklist))
            else:
                compile_obj = re.compile(rawstr)
                match_obj = compile_obj.search(workingtext)
                newstr = BeautifulSoup(match_obj.group(2))
                sl = []
                sourcelist = newstr.select("a[title]")
                for source in sourcelist:
                    source = source.attrs['title']
                    # Highlight epubs and hardcopy books using the styles defined in HTMLOpen.
                    if source.lower() in epublist:
                        source = "<span class='epub'>" + source + "</span>"
                    elif source.lower() in booklist:
                        source = "<span class='hardcopy'>" + source + "</span>"
                    sl.append(source.encode('utf8'))
                UpdateNeededDict[title] = sl
                print "\t({0} of {1})".format(len(UpdateNeededDict),len(linklist))
except:  # typically a KeyboardInterrupt; fall through and save whatever was collected
    pass

# Write one table row per page (this runs after a clean finish or an interrupt).
for k,v in sorted(UpdateNeededDict.items()):
    osl = ", ".join(v)
    writethis = HTMLLineStart + unicode(k).encode('utf8',errors='ignore') + '</td><td>' + osl + HTMLLineEnd + '\r'
    UpdateList.write(writethis)
UpdateList.write(HTMLClose)
UpdateList.close()
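I've never actually run these under Python 3, but the port looks small. A minimal, untested sketch of the fetch-and-parse step, with the main differences marked (the rest of each script needs the same print()/str/encoding changes):

import requests
from bs4 import BeautifulSoup

UpdateURL = 'http://www.sarna.net/wiki/index.php?title=Special:WhatLinksHere/Template:Update_Needed&limit=1500'
headers = {'User-Agent': "Mbear's Update Needed Builder",
           'From': 'pae@towerofjade.com'}

r = requests.get(UpdateURL, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')  # newer BeautifulSoup wants an explicit parser
linklist = soup.select("ul#mw-whatlinkshere-list > li > a[href]")
for link in linklist:
    # print is a function in Python 3, and str is already Unicode,
    # so the .encode('utf8') and unicode() calls go away.
    print("examining {0}".format(link.attrs['title']))

Opening the output file as open('UpdatesNeeded.html','w',encoding='utf8') should also let you write the titles directly instead of encoding each one by hand.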