# ===========================================================================
# eXe
# Copyright 2004-2006, University of Auckland
# Copyright 2006-2008 eXe Project, http://eXeLearning.org/
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
# ===========================================================================
"""
A Wikipedia Idevice is one built from a Wikipedia article.
"""

import re
import sys
from exe.engine.beautifulsoup import BeautifulSoup
from exe.engine.idevice import Idevice
from exe.engine.field import TextAreaField
from exe.engine.translate import lateTranslate
from exe.engine.path import Path, TempDirPath
from exe.engine.resource import Resource
import urllib


class UrlOpener(urllib.FancyURLopener):
    """
    Set a distinctive User-Agent, so Wikipedia.org knows we're not spammers
    """
    version = "eXe/exe@exelearning.org"
# Install the opener globally so urllib.urlopen/urlretrieve below use it.
urllib._urlopener = UrlOpener()

import logging
log = logging.getLogger(__name__)


def _toUnicode(value):
    """
    Returns 'value' as a unicode string.  Byte strings are decoded as UTF-8;
    unicode strings are returned untouched, because calling .decode() on a
    unicode object first encodes it as ASCII and so crashes on accented
    characters.
    """
    if isinstance(value, unicode):
        return value
    return value.decode('utf-8')


def _absoluteImageUrl(imageSrc, scheme, netloc, path):
    """
    Turns the 'src' of an <img> tag into a URL that can be downloaded.
    'scheme' is the scheme of the article URL including the colon
    (eg. 'http:'), 'netloc' is scheme plus host (eg. 'http://host') and
    'path' is the directory part of the article URL.
    """
    if imageSrc.startswith("http://") or imageSrc.startswith("https://"):
        # already absolute
        return imageSrc
    if imageSrc.startswith("//"):
        # protocol-relative: only the scheme is missing
        return scheme + imageSrc
    if imageSrc.startswith("/"):
        # relative to the server root
        return netloc + imageSrc
    # relative to the article's directory
    return '%s/%s/%s' % (netloc, path, imageSrc)


# ===========================================================================
class WikipediaIdevice(Idevice):
    """
    A Wikipedia Idevice is one built from a Wikipedia article.
    The article is downloaded by loadArticle(), its images are stored as
    resources of this idevice and the cleaned up HTML is kept in
    self.article.
    """
    persistenceVersion = 8

    def __init__(self, defaultSite):
        """
        'defaultSite' is stored as self.site, the base URL that
        loadArticle() puts in front of the article name.
        """
        Idevice.__init__(self, x_(u"Wiki Article"),
                         x_(u"University of Auckland"),
                         x_(u"""

The Wikipedia iDevice allows you to locate existing content from within Wikipedia and download this content into your eXe resource. The Wikipedia Article iDevice takes a snapshot copy of the article content. Changes in Wikipedia will not automatically update individual snapshot copies in eXe, a fresh copy of the article will need to be taken. Likewise, changes made in eXe will not be updated in Wikipedia.

Wikipedia content is covered by the GNU free documentation license.

"""), u"", u"")
        self.emphasis = Idevice.NoEmphasis
        self.articleName = u""
        self.article = TextAreaField(x_(u"Article"))
        self.article.idevice = self
        self.images = {}
        self.site = defaultSite
        self.icon = u"inter"
        self.systemResources += ["fdl.html"]
        self._langInstruc = x_(
            u"Select the appropriate language version of Wikipedia "
            u"to search and enter search term.")
        self._searchInstruc = x_(
            "Enter a phrase or term you wish to search within Wikipedia.")
        self.ownUrl = ""

    # Properties
    langInstruc = lateTranslate('langInstruc')
    searchInstruc = lateTranslate('searchInstruc')

    def _setArticleContent(self, content):
        """
        Stores 'content' in the article field.  Any direct manipulation of
        the content must also be stored into the other corresponding
        fields of TextAreaField, so all three are kept in step here.
        """
        self.article.content = content
        self.article.content_w_resourcePaths = self.article.content
        self.article.content_wo_resourcePaths = self.article.content

    def _reportDownloadFailure(self, url):
        """
        Replaces the article content with a message telling the user that
        'url' could not be downloaded.
        """
        # The article content is HTML, so the line break has to be a tag.
        self._setArticleContent(
            _(u"Unable to download from %s <br/>"
              u"Please check the spelling and connection and try again.")
            % url)

    def loadArticle(self, name):
        """
        Load the article from Wikipedia.
        'name' is the article title; it is remembered in self.articleName
        and appended (URL quoted) to self.site, or to self.ownUrl when no
        site is set.  On a download problem the article content is set to
        an error message instead.
        """
        self.articleName = name
        url = ""
        name = urllib.quote(name.replace(" ", "_").encode('utf-8'))
        try:
            url = (self.site or self.ownUrl)
            if not url.endswith('/') and name != '':
                url += '/'
            if '://' not in url:
                url = 'http://' + url
            url += name
            net = urllib.urlopen(url)
            try:
                page = net.read()
            finally:
                # close the connection even if reading fails
                net.close()
        except IOError:
            # sys.exc_info() is used rather than binding the exception in
            # the except clause, whose syntax differs between Python versions
            log.warning(unicode(sys.exc_info()[1]))
            self._reportDownloadFailure(url)
            return

        page = unicode(page, "utf8")
        # FIXME avoid problems with numeric entities in attributes
        page = page.replace(u'&#160;', u'&nbsp;')

        # avoidParserProblems is set to False because BeautifulSoup's
        # cleanup was causing a "concatenating Null+Str" error,
        # and Wikipedia's HTML doesn't need cleaning up.
        # BeautifulSoup is faster this way too.
        soup = BeautifulSoup(page, False)
        content = soup.first('div', {'id': "content"})

        if content:
            # remove the wiktionary, wikimedia commons, and categories boxes
            # and the protected icon and the needs citations box
            unwantedBoxes = [
                ('div', {'class': 'infobox sisterproject'}),
                ('div', {'id': 'catlinks'}),
                ('table', {'class': re.compile(r'.*\bambox\b.*')}),
                ('div', {'id': 'protected-icon'}),
            ]
            for tagName, attrs in unwantedBoxes:
                for box in content.findAll(tagName, attrs):
                    box.extract()
        else:
            # not a Wikipedia style page, take the whole body instead
            content = soup.first('body')

        if not content:
            log.error("no content")
            self._reportDownloadFailure(url)
            return

        # clear out any old images
        while self.userResources:
            self.userResources[0].delete()
        self.images = {}

        # Download the images
        bits = url.split('/')
        netloc = '%s//%s' % (bits[0], bits[2])
        path = '/'.join(bits[3:-1])
        tmpDir = TempDirPath()
        # Maps the file name found in the page to the name the resource
        # was finally stored under, so an image that had to be renamed is
        # neither downloaded twice nor mixed up with another image.
        storedNames = {}
        for imageTag in content.fetch('img'):
            imageSrc = unicode(imageTag['src'])
            imageName = imageSrc.split('/')[-1]
            # Search if we've already got this image
            if imageName in storedNames:
                imageName = storedNames[imageName]
            else:
                imageUrl = _absoluteImageUrl(imageSrc, bits[0], netloc, path)
                urllib.urlretrieve(imageUrl, tmpDir/imageName)
                new_resource = Resource(self, tmpDir/imageName)
                # the storage name can differ from the file name due to a
                # possible conflict, so use it for the content:
                storedNames[imageName] = new_resource._storageName
                imageName = new_resource._storageName
                self.images[imageName] = True
            # We have to use absolute URLs if we want the images to
            # show up in edit mode inside FCKEditor
            imageTag['src'] = (u"/" + self.parentNode.package.name +
                               u"/resources/" + imageName)

        self._setArticleContent(self.reformatArticle(netloc,
                                                     unicode(content)))

    def reformatArticle(self, netloc, content):
        """
        Changes links, etc.
        Makes server-relative links absolute using 'netloc', removes the
        "edit section" and "jump to navigation" elements, flattens the
        newlines and removes scripts.  Returns the new content.
        """
        content = re.sub(r'href="/', r'href="%s/' % netloc, content)
        # Remove through the matching closing tag; stopping right after the
        # attribute would leave the element's text and its closing tag behind.
        content = re.sub(
            r'<(span|div)\s+(id|class)="(editsection|jump-to-nav)".*?</\1>',
            '', content)
        #TODO Find a way to remove scripts without removing newlines
        content = content.replace("\n", " ")
        content = re.sub(r'<script.*?</script>', '', content)
        return content

    def getResourcesField(self, this_resource):
        """
        implement the specific resource finding mechanism for this iDevice:
        returns self.article if 'this_resource' belongs to it, else None.
        """
        # be warned that before upgrading, this iDevice field could not exist:
        if hasattr(self, 'article') and hasattr(self.article, 'images'):
            for this_image in self.article.images:
                if hasattr(this_image, '_imageResource') \
                and this_resource == this_image._imageResource:
                    return self.article

        # NOTE that WikipediaIdevices list their images
        # in the idevice's .userResources, not in its .article.images...
        # a slightly different (and earlier) approach to embedding images:
        for this_image in self.userResources:
            if this_resource == this_image:
                return self.article

        return None

    def getRichTextFields(self):
        """
        Like getResourcesField(), a general helper to allow nodes to search
        through all of their fields without having to know the specifics of
        each iDevice type.
        """
        fields_list = []
        if hasattr(self, 'article'):
            fields_list.append(self.article)
        return fields_list

    def burstHTML(self, i):
        """
        takes a BeautifulSoup fragment (i) and bursts its contents to
        import this idevice from a CommonCartridge export
        """
        # Wiki Article Idevice:
        # option title for Wikipedia, with mode emphasis:
        title = i.find(name='span', attrs={'class' : 'iDeviceTitle' })
        if title is not None:
            self.title = title.renderContents().decode('utf-8')
            self.emphasis = Idevice.SomeEmphasis

        wiki = i.find(name='div', attrs={'id' : re.compile('^ta') })
        if wiki is not None:
            self.article.content_wo_resourcePaths = \
                wiki.renderContents().decode('utf-8')
            # and add the LOCAL resource paths back in:
            self.article.content_w_resourcePaths = \
                self.article.MassageResourceDirsIntoContent( \
                    self.article.content_wo_resourcePaths)
            self.article.content = self.article.content_w_resourcePaths

        # The attribute values below go through _toUnicode() because a plain
        # .decode('utf-8') crashes on accented characters when the value is
        # already unicode ('ascii' codec can't encode character ...).
        site = i.find(name='div', attrs={'class' : 'wiki_site' })
        if site is not None:
            self.site = _toUnicode(site.attrMap['value'])

        name = i.find(name='div', attrs={'class' : 'article_name' })
        if name is not None:
            self.articleName = _toUnicode(name.attrMap['value'])

        own_url = i.find(name='div', attrs={'class' : 'own_url' })
        if own_url is not None:
            # ownUrl is the attribute loadArticle() reads
            self.ownUrl = _toUnicode(own_url.attrMap['value'])

    def __getstate__(self):
        """
        Re-write the img URLs just in case the class name has changed
        """
        log.debug("in __getstate__ " + repr(self.parentNode))

        # need to check parentNode because __getstate__ is also called by
        # deepcopy as well as Jelly.
        if self.parentNode:
            self.article.content = re.sub(r'/[^/]*?/resources/',
                                          u"/" + self.parentNode.package.name +
                                          u"/resources/",
                                          self.article.content)

        return Idevice.__getstate__(self)

    def delete(self):
        """
        Clear out any old images when this iDevice is deleted
        """
        self.images = {}
        Idevice.delete(self)

    def upgradeToVersion1(self):
        """
        Called to upgrade from 0.6 release
        """
        self.site = _('http://en.wikipedia.org/')

    def upgradeToVersion2(self):
        """
        Upgrades v0.6 to v0.7.
        """
        self.lastIdevice = False

    def upgradeToVersion3(self):
        """
        Upgrades exe to v0.10
        """
        self._upgradeIdeviceToVersion1()
        self._site = self.__dict__['site']

    def upgradeToVersion4(self):
        """
        Upgrades exe to v0.11... what was I thinking?
        """
        self.site = self.__dict__['_site']

    def upgradeToVersion5(self):
        """
        Upgrades exe to v0.11... forgot to change the icon
        """
        self.icon = u"inter"

    def upgradeToVersion6(self):
        """
        Upgrades to v0.12
        """
        self._upgradeIdeviceToVersion2()
        self.systemResources += ["fdl.html"]

        if self.images and self.parentNode:
            for image in self.images:
                # created for its effect only, the object itself is not
                # needed here
                Resource(self, Path(image))

    def upgradeToVersion7(self):
        """
        Upgrades to v0.12
        """
        self._langInstruc = x_(
            u"Select the appropriate language version of Wikipedia "
            u"to search and enter search term.")
        self._searchInstruc = x_(
            "Enter a phrase or term you wish to search within Wikipedia.")

    def upgradeToVersion8(self):
        """
        Upgrades to v0.19
        """
        self.ownUrl = ""

# ===========================================================================