User:Mbear/UpdatesNeededProgram

Requirements

I don't remember the details exactly; I wrote these two programs several years ago. You need Python with the requests and BeautifulSoup modules. I've only run them under Python 2.7, and as written (print statements, unicode() calls) they won't run under Python 3 without porting.
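Both programs also expect two plain-text files in the working directory: epublist-bare.txt (epub-only titles) and booklist.txt (hardcopy titles). Each holds one title per line, already lowercased, since the scripts compare against source.lower(). Something like this (the titles are only illustrative):

a bonfire of worlds
embers of war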

Updates Needed Builder

The following Python 2.7 program accesses the Updates Needed page, gets the list of linked articles, then examines each of those pages to find the contents of the "Update Needed" tag. The results are saved as rows in an HTML table. Character articles are NOT included, because I don't do biography pages. (See the next heading.)
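A generated row looks roughly like this (the page and source names are illustrative; epub-only sources get the pink span, hardcopies the blue one):

<tr><td>Algedi</td><td><span class='epub'>Embers of War</span>, Technical Readout: 3150</td></tr>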

import re
import requests
from bs4 import BeautifulSoup

HTMLOpen = """<!DOCTYPE html>
<html>
<head>
    <title>Updates Needed List</title>
    <meta http-equiv="Content-Type" content="text/html;charset=utf-8"> 
    <style>
        body{font-family:"Arial";font-size:8pt;}
        table,tr,th,td{border:1px solid #000;border-collapse:collapse;padding:1px;}
        span.hardcopy {background-color:#5ddfff;}
        span.epub{background-color:#ffc0cb;}
    </style>
</head>
<body>
    <table>
        <thead>
            <tr>
                <th>Page</th>
                <th>Sources</th>
            </tr>
        </thead>
    """

HTMLLineStart = "<tr><td>"
HTMLLineEnd = "</td></tr>"
    
HTMLClose="""    </table>
</body>
</html>"""


rawstr = """(<b>Update Needed</b><br /><div style="font-size: 90%; text-align: center">This article needs to be updated with material from <i>)(.+)(</i>. Once this title clears the <a href="/wiki/Policy:Moratorium" title="Policy:Moratorium">Moratorium period</a>, or if it already has, please consider revisiting this article and updating it with the new material, removing this tag once all information has been added.)"""

UpdateNeededDict = {}

epublist = open('epublist-bare.txt','r').read().splitlines()  #needed to get each item from list without trailing newline characters
booklist = open('booklist.txt','r').read().splitlines()


UpdateURL = 'http://www.sarna.net/wiki/index.php?title=Special:WhatLinksHere/Template:Update_Needed&limit=1500'
wikibase="http://www.sarna.net"
headers = {
    'User-Agent':'Mbear\'s Update Needed Builder',
    'From':'pae@towerofjade.com'}

r = requests.get(UpdateURL, headers=headers)
text = r.text
soup = BeautifulSoup(text, 'html.parser')  #name a parser explicitly; newer bs4 warns otherwise
linklist = soup.select("ul#mw-whatlinkshere-list > li > a[href]")

UpdateList = open('UpdatesNeeded.html','w')
UpdateList.write(HTMLOpen)

characterCount = 0

compile_obj = re.compile(rawstr)  #compile the tag-matching pattern once, not once per page

try:
    for link in linklist:
        title = link.attrs['title']
        if ":" in title:
            #namespaced pages (Template:, Policy:, etc.) are not articles
            print "examining {0}\tSKIPPING FILE - System file".format(title)
            continue
        print "examining {0}".format(title.encode('utf8', errors='ignore')),
        workinglink = wikibase + link.attrs['href']
        req = requests.get(workinglink, headers=headers)
        workingtext = req.text
        souptext = BeautifulSoup(workingtext, 'html.parser')
        #a category link whose title contains "Characters" marks a biography page
        categorylist = souptext.select("div#mw-normal-catlinks > ul > li > a[title*=Characters]")
        if len(categorylist) > 0:
            characterCount += 1
            print "\tSKIPPING FILE - Character: {0}".format(characterCount)
            continue
        match_obj = compile_obj.search(workingtext)
        if match_obj is None:
            #the Update Needed box didn't match the expected boilerplate; skip it
            print "\tSKIPPING FILE - tag text not recognized"
            continue
        #group(2) holds the source links inside the Update Needed box
        newstr = BeautifulSoup(match_obj.group(2), 'html.parser')
        sl = []
        for source in newstr.select("a[title]"):
            source = source.attrs['title']
            if source.lower() in epublist:
                source = "<span class='epub'>" + source + "</span>"
            elif source.lower() in booklist:
                source = "<span class='hardcopy'>" + source + "</span>"
            sl.append(source.encode('utf8'))
        UpdateNeededDict[title] = sl
        print "\t({0} of {1})".format(len(UpdateNeededDict), len(linklist))
except KeyboardInterrupt:
    pass  #on Ctrl-C, fall through and write whatever has been collected so far

#write each collected page as a table row: page title, then its sources
for k, v in sorted(UpdateNeededDict.items()):
    osl = ", ".join(v)
    writethis = HTMLLineStart + unicode(k).encode('utf8', errors='ignore') + '</td><td>' + osl + HTMLLineEnd + '\n'
    UpdateList.write(writethis)

UpdateList.write("Character Count: {0}".format(characterCount))
UpdateList.write(HTMLClose)
UpdateList.close()
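If you want to try this under Python 3, the core fetch-and-skip loop ports roughly like this. This is a minimal, untested sketch: it keeps the same URLs, headers, and CSS selectors as above, and Python 3's str type makes the encode/unicode calls unnecessary. The regex/source extraction from the full program would slot in where the print is.

import requests
from bs4 import BeautifulSoup

wikibase = "http://www.sarna.net"
UpdateURL = "http://www.sarna.net/wiki/index.php?title=Special:WhatLinksHere/Template:Update_Needed&limit=1500"
headers = {'User-Agent': "Mbear's Update Needed Builder", 'From': 'pae@towerofjade.com'}

soup = BeautifulSoup(requests.get(UpdateURL, headers=headers).text, 'html.parser')
for link in soup.select("ul#mw-whatlinkshere-list > li > a[href]"):
    title = link.attrs['title']
    if ":" in title:
        continue  #skip namespaced (system) pages
    page = requests.get(wikibase + link.attrs['href'], headers=headers).text
    page_soup = BeautifulSoup(page, 'html.parser')
    if page_soup.select("div#mw-normal-catlinks > ul > li > a[title*=Characters]"):
        continue  #skip biography pages, as in the Python 2 version
    print(title)  #the regex/source extraction goes here in a full port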

Updates Needed Builder People

This program builds a list of all characters/people whose articles need information added. It is the previous program with the category test inverted: character pages are kept and everything else is skipped.
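Concretely, a page counts as a character page when the category bar MediaWiki renders at the foot of each article (the div#mw-normal-catlinks block) contains any link whose title includes "Characters". The markup it matches has roughly this shape (the category name here is illustrative):

<div id="mw-normal-catlinks">Categories: <ul><li><a href="/wiki/Category:Characters" title="Category:Characters">Characters</a></li></ul></div>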

import re
import requests
from bs4 import BeautifulSoup

HTMLOpen = """<!DOCTYPE html>
<html>
<head>
    <title>Characters Updates Needed List</title>
    <meta http-equiv="Content-Type" content="text/html;charset=utf-8"> 
    <style>
        body{font-family:"Arial";font-size:8pt;}
        table,tr,th,td{border:1px solid #000;border-collapse:collapse;padding:1px;}
        span.hardcopy {background-color:#5ddfff;}
        span.epub{background-color:#ffc0cb;}
    </style>
</head>
<body>
    <table>
        <thead>
            <tr>
                <th>Page</th>
                <th>Sources</th>
            </tr>
        </thead>
    """

HTMLLineStart = "<tr><td>"
HTMLLineEnd = "</td></tr>"
    
HTMLClose="""    </table>
</body>
</html>"""


rawstr = """(<b>Update Needed</b><br /><div style="font-size: 90%; text-align: center">This article needs to be updated with material from <i>)(.+)(</i>. Once this title clears the <a href="/wiki/Policy:Moratorium" title="Policy:Moratorium">Moratorium period</a>, or if it already has, please consider revisiting this article and updating it with the new material, removing this tag once all information has been added.)"""

UpdateNeededDict = {}

epublist = open('epublist-bare.txt','r').read().splitlines()  #needed to get each item from list without trailing newline characters
booklist = open('booklist.txt','r').read().splitlines()


UpdateURL = 'http://www.sarna.net/wiki/index.php?title=Special:WhatLinksHere/Template:Update_Needed&limit=1500'
wikibase="http://www.sarna.net"
headers = {
    'User-Agent':'Mbear\'s Update Needed Builder',
    'From':'pae@towerofjade.com'}

r = requests.get(UpdateURL, headers=headers)
text = r.text
soup = BeautifulSoup(text, 'html.parser')  #name a parser explicitly; newer bs4 warns otherwise
linklist = soup.select("ul#mw-whatlinkshere-list > li > a[href]")

UpdateList = open('CharactersUpdateNeeded.html','w')
UpdateList.write(HTMLOpen)

counter = 0

compile_obj = re.compile(rawstr)  #compile the tag-matching pattern once, not once per page

try:
    for link in linklist:
        title = link.attrs['title']
        if ":" in title:
            #namespaced pages (Template:, Policy:, etc.) are not articles
            print "examining {0}\tSKIPPING FILE - System file".format(title)
            continue
        print "examining {0}".format(title.encode('utf8', errors='ignore')),
        workinglink = wikibase + link.attrs['href']
        req = requests.get(workinglink, headers=headers)
        workingtext = req.text
        souptext = BeautifulSoup(workingtext, 'html.parser')
        #a category link whose title contains "Characters" marks a biography page
        categorylist = souptext.select("div#mw-normal-catlinks > ul > li > a[title*=Characters]")
        if len(categorylist) == 0:
            #not a character page; the other program covers these
            counter += 1
            print "\t({0} of {1})".format(counter, len(linklist))
            continue
        match_obj = compile_obj.search(workingtext)
        if match_obj is None:
            #the Update Needed box didn't match the expected boilerplate; skip it
            print "\tSKIPPING FILE - tag text not recognized"
            continue
        #group(2) holds the source links inside the Update Needed box
        newstr = BeautifulSoup(match_obj.group(2), 'html.parser')
        sl = []
        for source in newstr.select("a[title]"):
            source = source.attrs['title']
            if source.lower() in epublist:
                source = "<span class='epub'>" + source + "</span>"
            elif source.lower() in booklist:
                source = "<span class='hardcopy'>" + source + "</span>"
            sl.append(source.encode('utf8'))
        UpdateNeededDict[title] = sl
        print "\t({0} of {1})".format(len(UpdateNeededDict), len(linklist))
except KeyboardInterrupt:
    pass  #on Ctrl-C, fall through and write whatever has been collected so far

#write each collected character page as a table row: page title, then its sources
for k, v in sorted(UpdateNeededDict.items()):
    osl = ", ".join(v)
    writethis = HTMLLineStart + unicode(k).encode('utf8', errors='ignore') + '</td><td>' + osl + HTMLLineEnd + '\n'
    UpdateList.write(writethis)

UpdateList.write(HTMLClose)
UpdateList.close()