#!/usr/bin/python # -*- coding: utf-8 -*- """ Script to help a human solve disambiguations by presenting a set of options. Specify the disambiguation page on the command line, or enter it at the prompt after starting the program. (If the disambiguation page title starts with a '-', you cannot name it on the command line, but you can enter it at the prompt.) The program will pick up the page, and look for all alternative links, and show them with a number adjacent to them. It will then automatically loop over all pages referring to the disambiguation page, and show 30 characters of context on each side of the reference to help you make the decision between the alternatives. It will ask you to type the number of the appropriate replacement, and perform the change. It is possible to choose to replace only the link (just type the number) or replace both link and link-text (type 'r' followed by the number). Multiple references in one page will be scanned in order, but typing 'n' (next) on any one of them will leave the complete page unchanged. To leave only some reference unchanged, use the 's' (skip) option. Command line options: -pos:XXXX adds XXXX as an alternative disambiguation -just only use the alternatives given on the command line, do not read the page for other possibilities -primary "primary topic" disambiguation (Begriffsklärung nach Modell 2). That's titles where one topic is much more important, the disambiguation page is saved somewhere else, and the important topic gets the nice name. -primary:XY like the above, but use XY as the only alternative, instead of searching for alternatives in [[Keyword (disambiguation)]]. Note: this is the same as -primary -just -pos:XY -file:XYZ reads a list of pages from a text file. XYZ is the name of the file from which the list is taken. If XYZ is not given, the user is asked for a filename. Page titles should be inside [[double brackets]]. The -pos parameter won't work if -file is used. -always:XY instead of asking the user what to do, always perform the same action. For example, XY can be "r0", "u" or "2". Be careful with this option, and check the changes made by the bot. Note that some choices for XY don't make sense and will result in a loop, e.g. "l" or "m". -main only check pages in the main namespace, not in the talk, wikipedia, user, etc. namespaces. To complete a move of a page, one can use: python solve_disambiguation.py -just -pos:New_Name Old_Name """ # # (C) Rob W.W. Hooft, 2003 # (C) Daniel Herding, 2004 # (C) Andre Engels, 2003-2004 # (C) WikiWichtel, 2004 # # Distributed under the terms of the MIT license. # __version__='$Id: solve_disambiguation.py,v 1.213 2006/03/12 20:14:01 wikipedian Exp $' # # Standard library imports import re, sys, codecs # Application specific imports import wikipedia, pagegenerators # This is a purely interactive robot. We set the delays lower. wikipedia.put_throttle.setDelay(4) # Summary message when working on disambiguation pages msg = { 'en': u'Robot-assisted disambiguation: %s', 'da': u'Retter flertydigt link til: %s', 'de': u'Bot-unterstützte Begriffsklärung: %s', 'ia': u'Disambiguation assistite per robot: %s', 'nl': u'Robot-geholpen doorverwijzing: %s', 'fr': u'Homonymie résolue à l\'aide du robot: %s', 'pt': u'Desambiguação assistida por bot: %s', 'he': u'פתרון הפניה לפירושונים על ידי בוט: %s', 'ru': u'Разрешение значений с помощью бота: %s', 'sr': u'Решавање вишезначних одредница помоћу бота: %s', } # Summary message when working on redirects msg_redir = { 'en': u'Robot-assisted disambiguation: %s', 'da': u'Retter flertydigt link til: %s', 'de': u'Bot-unterstützte Redirectauflösung: %s', 'ia': u'Resolution de redirectiones assistite per robot: %s', 'nl': u'Robot-geholpen redirect-oplossing: %s', 'fr': u'Correction de lien vers redirect: %s', 'pt': u'Desambiguação assistida por bot: %s', 'he': u'פתרון הפניה לפירושונים על ידי בוט: %s', 'ru': u'Разрешение значений с помощью бота: %s', 'sr': u'Решавање вишезначних одредница помоћу бота: %s', } # disambiguation page name format for "primary topic" disambiguations # (Begriffsklärungen nach Modell 2) primary_topic_format = { 'de': u'%s_(Begriffsklärung)', 'en': u'%s_(disambiguation)', 'ia': u'%s_(disambiguation)', 'nl': u'%s_(doorverwijspagina)', 'pt': u'%s_(desambiguação)', 'he': u'%s_(פירושונים)', 'ru': u'%s_(значения)', 'sr': u'%s_(вишезначна одредница)', } # List pages that will be ignored if they got a link to a disambiguation # page. An example is a page listing disambiguations articles. # Special chars should be encoded with unicode (\x##) and space used # instead of _ ignore_title = { 'wikipedia': { 'da': [ u'Wikipedia:Links til sider med flertydige titler' ], 'de': [ u'Benutzer:Katharina/Begriffsklärungen', u'Benutzer:Noisper/Dingliste/[A-Z]', u'Benutzer:SirJective/.+', u'Benutzer Diskussion:.+', u'GISLexikon \([A-Z]\)', u'Lehnwort', u'Wikipedia:Archiv:.+', u'Wikipedia:Artikelwünsche/Ding-Liste/[A-Z]', u'Wikipedia:Begriffsklärung.*', u'Wikipedia:Dreibuchstabenkürzel von [A-Z][A-Z][A-Z] bis [A-Z][A-Z][A-Z]', u'Wikipedia:Interwiki-Konflikte', u'Wikipedia:Kurze Artikel', u'Wikipedia:Liste aller 2-Buchstaben-Kombinationen', u'Wikipedia:Liste mathematischer Themen/BKS', u'Wikipedia:Liste mathematischer Themen/Redirects', u'Wikipedia:Löschkandidaten/.+', u'Wikipedia:Qualitätsoffensive/UNO', #requested by Benutzer:Addicted u'Wikipedia:WikiProjekt Altertumswissenschaft/.+' ], 'en': [ u'Wikipedia:Links to disambiguating pages', u'Wikipedia:Disambiguation pages with links', u'Wikipedia:Multiple-place names \([A-Z]\)', u'Wikipedia:Non-unique personal name', u"User:Jerzy/Disambiguation Pages i've Editted", u'User:Gareth Owen/inprogress', u'TLAs from [A-Z][A-Z][A-Z] to [A-Z][A-Z][A-Z]', u'List of all two-letter combinations', u'User:Daniel Quinlan/redirects.+', u'User:Oliver Pereira/stuff', u'Wikipedia:French Wikipedia language links', u'Wikipedia:Polish language links', u'Wikipedia:Undisambiguated abbreviations/.+', u'List of acronyms and initialisms', u'Wikipedia:Usemod article histories', u'User:Pizza Puzzle/stuff', u'List of generic names of political parties', u'Talk:List of initialisms/marked', u'Talk:List of initialisms/sorted', u'Talk:Programming language', u'Talk:SAMPA/To do', u"Wikipedia:Outline of Roget's Thesaurus", u'User:Wik/Articles', u'User:Egil/Sandbox', u'Wikipedia talk:Make only links relevant to the context', u'Wikipedia:Common words, searching for which is not possible' ], 'fr': [ u'Wikipédia:Liens aux pages d\'homonymie', u'Wikipédia:Homonymie', u'Wikipédia:Homonymie/Homonymes dynastiques', u'Wikipédia:Prise de décision, noms des membres de dynasties/liste des dynastiens', u'Liste de toutes les combinaisons de deux lettres', u'Wikipédia:Log d\'upload/.*', u'Sigles de trois lettres de [A-Z]AA à [A-Z]ZZ', u'Wikipédia:Pages sans interwiki,.' ], 'fy': [ u'Wikipedy:Fangnet', ], 'ia': [ u'Categoria:Disambiguation', u'Wikipedia:.+', u'Usator:.+', u'Discussion Usator:.+', ], 'nl': [ u'Wikipedia:Onderhoudspagina', u'Wikipedia:Doorverwijspagina', u'Wikipedia:Lijst van alle tweeletter-combinaties', u'Gebruiker:Hooft/robot/Interwiki/lijst van problemen', u'Wikipedia:Woorden die niet als zoekterm gebruikt kunnen worden', u'Gebruiker:Puckly/Bijdragen', u'Gebruiker:Waerth/bijdragen', u"Wikipedia:Project aanmelding bij startpagina's", u'Gebruiker:Gustar/aantekeningen denotatie annex connotatie', u'Wikipedia:Protection log', u'Gebruiker:Pven/Romeinse cijfers', u'Categorie:Doorverwijspagina', u'Wikipedia:Ongelijke redirects', u'Gebruiker:Cars en travel', u'Wikipedia:Archief*', u'Overleg Wikipedia:Logboek*', u'Gebruiker:Rex/Gestarte artikelen', u'Gebruiker:Ucucha/Doorverwijspagina', u'Gebruiker:CyeZ/Klad2', u'Wikipedia:De kroeg/Archief.+', u'Overleg gebruiker:*Archief*', ], 'pt': [ u'Categoria:Desambiguação', u'Wikipedia:.+', u'Usuário:.+', u'Usuário Discussão:.+', ], 'ru': [ u'Категория:Disambig', u'Википедия:Страницы разрешения неоднозначностей', u'Википедия:Вики-уборка/Статьи без языковых ссылок', u'Википедия:Страницы с пометкой «(значения)»', u'Список общерусских фамилий', ], }, 'memoryalpha': { 'en': [ u'Memory Alpha:Links to disambiguating pages' ], 'de': [ u'Memory Alpha:Liste der Wortklärungsseiten' ], }, } class ReferringPageGeneratorWithIgnore: def __init__(self, disambPage, primary=False): self.disambPage = disambPage # if run with the -primary argument, enable the ignore manager self.primaryIgnoreManager = PrimaryIgnoreManager(disambPage, enabled=primary) def __iter__(self): refs = self.disambPage.getReferences(follow_redirects = False, withTemplateInclusion = False) wikipedia.output(u"Found %d references." % len(refs)) # Remove ignorables if ignore_title.has_key(self.disambPage.site().family.name) and ignore_title[self.disambPage.site().family.name].has_key(self.disambPage.site().lang): for ig in ignore_title[self.disambPage.site().family.name][self.disambPage.site().lang]: for i in range(len(refs)-1, -1, -1): if re.match(ig, refs[i].title()): wikipedia.output('Ignoring page %s' % refs[i].title()) del refs[i] elif self.primaryIgnoreManager.isIgnored(refs[i]): #wikipedia.output('Ignoring page %s because it was skipped before' % refs[i].title()) del refs[i] wikipedia.output(u"Will work on %d pages." % len(refs)) for ref in refs: yield ref class PrimaryIgnoreManager(object): ''' If run with the -primary argument, reads from a file which pages should not be worked on; these are the ones where the user pressed n last time. If run without the -primary argument, doesn't ignore any pages. ''' def __init__(self, disambPage, enabled = False): self.disambPage = disambPage self.enabled = enabled self.ignorelist = [] filename = 'disambiguations/' + self.disambPage.urlname() + '.txt' try: # The file is stored in the disambiguation/ subdir. Create if necessary. f = codecs.open(self.makepath(filename), 'r', 'utf-8') for line in f.readlines(): # remove trailing newlines and carriage returns while line[-1] in ['\n', '\r']: line = line[:-1] #skip empty lines if line != '': self.ignorelist.append(line) f.close() except IOError: pass def isIgnored(self, refpl): return self.enabled and refpl.urlname() in self.ignorelist def ignore(self, refpl): if self.enabled: # Skip this occurence next time. filename = 'disambiguations/' + self.disambPage.urlname() + '.txt' try: # Open file for appending. If none exists yet, create a new one. # The file is stored in the disambiguation/ subdir. Create if necessary. f = codecs.open(self.makepath(filename), 'a', 'utf-8') f.write(refpl.urlname() + '\n') f.close() except IOError: pass def makepath(self, path): """ creates missing directories for the given path and returns a normalized absolute version of the path. - if the given path already exists in the filesystem the filesystem is not modified. - otherwise makepath creates directories along the given path using the dirname() of the path. You may append a '/' to the path if you want it to be a directory path. from holger@trillke.net 2002/03/18 """ from os import makedirs from os.path import normpath,dirname,exists,abspath dpath = normpath(dirname(path)) if not exists(dpath): makedirs(dpath) return normpath(abspath(path)) class DisambiguationRobot(object): ignore_contents = { 'de':(u'{{[Ii]nuse}}', u'{{[Ll]öschen}}', ), 'ru':(u'{{[Ii]nuse}}', u'{{[Pp]rocessing}}', ), } def __init__(self, always, alternatives, getAlternatives, generator, primary, main_only): self.always = always self.alternatives = alternatives self.getAlternatives = getAlternatives self.generator = generator self.primary = primary self.main_only = main_only self.mysite = wikipedia.getSite() self.mylang = self.mysite.language() self.setupRegexes() def checkContents(self, text): ''' For a given text, returns False if none of the regular expressions given in the dictionary at the top of this class matches a substring of the text. Otherwise returns the substring which is matched by one of the regular expressions. ''' for ig in self.ignore_contents_regexes: match = ig.search(text) if match: return match.group() return None def makeAlternativesUnique(self): # remove duplicate entries result={} for i in self.alternatives: result[i]=None self.alternatives = result.keys() def listAlternativesGui(self): # list in new window, does not behave as expected, so not used currently. print '\n\t\t--> beachte neues Fenster <--' import gui list_window = gui.ListBoxWindow() list_window.list(self.alternatives) def listAlternatives(self): list = u'\n' for i in range(len(self.alternatives)): list += (u"%3i - %s\n" % (i, self.alternatives[i])) wikipedia.output(list) def setupRegexes(self): # compile regular expressions self.ignore_contents_regexes = [] if self.ignore_contents.has_key(self.mylang): for ig in self.ignore_contents[self.mylang]: self.ignore_contents_regexes.append(re.compile(ig)) linktrail = self.mysite.linktrail() self.trailR = re.compile(linktrail) # The regular expression which finds links. Results consist of four groups: # group title is the target page title, that is, everything before | or ]. # group section is the page section. It'll include the # to make life easier for us. # group label is the alternative link title, that's everything between | and ]. # group linktrail is the link trail, that's letters after ]] which are part of the word. # note that the definition of 'letter' varies from language to language. self.linkR = re.compile(r'\[\[(?P