diff --git a/.gitignore b/.gitignore index 8dba2df3..6dde2f56 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ /_cache /_output /.venv +/data diff --git a/Dockerfile b/Dockerfile index d61009d9..c2280892 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,12 +1,12 @@ -FROM ubuntu:bionic +FROM ubuntu:noble RUN apt-get update \ && apt-get install -y --no-install-recommends \ - python-pip python-setuptools python-wheel \ + python3-pip python3-setuptools python3-wheel \ locales tzdata \ ca-certificates \ strace gdb lsof locate net-tools htop iputils-ping dnsutils \ - python2.7-dbg python2.7 libpython2.7 python-dbg libpython-dbg \ + python3-dbg libpython3-dbg \ curl nano vim tree less telnet patch \ graphviz sqlite3 \ dumb-init \ @@ -25,9 +25,9 @@ WORKDIR /planet ENTRYPOINT ["dumb-init"] RUN echo "#!/bin/bash -eux \n\ -python2.7 code/planet.py config/config.ini \n\ +python3.12 code/planet.py config/config.ini \n\ cd /srv/planetpython.org/ \n\ -python2.7 -mSimpleHTTPServer 8080 \n\ +python3.12 -m http.server 8080 \n\ "> /start.sh RUN chmod +x /start.sh EXPOSE 8080 diff --git a/README.md b/README.md new file mode 100644 index 00000000..e69de29b diff --git a/code/planet-cache.py b/code/planet-cache.py index 9334583a..a5d86ddd 100755 --- a/code/planet-cache.py +++ b/code/planet-cache.py @@ -1,194 +1,228 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- -"""Planet cache tool. +#!/usr/bin/env python3 +"""Planet cache tool.""" -""" - -__authors__ = [ "Scott James Remnant ", - "Jeff Waugh " ] +__authors__ = [ + "Scott James Remnant ", + "Jeff Waugh ", +] __license__ = "Python" +import argparse +import configparser import os +import shelve import sys import time -import dbhash -import ConfigParser import planet def usage(): - print "Usage: planet-cache [options] CACHEFILE [ITEMID]..." - print - print "Examine and modify information in the Planet cache." - print - print "Channel Commands:" - print " -C, --channel Display known information on the channel" - print " -L, --list List items in the channel" - print " -K, --keys List all keys found in channel items" - print - print "Item Commands (need ITEMID):" - print " -I, --item Display known information about the item(s)" - print " -H, --hide Mark the item(s) as hidden" - print " -U, --unhide Mark the item(s) as not hidden" - print - print "Other Options:" - print " -h, --help Display this help message and exit" + print("Usage: planet-cache [options] CACHEFILE [ITEMID]...") + print() + print("Examine and modify information in the Planet cache.") + print() + print("Channel Commands:") + print(" -C, --channel Display known information on the channel") + print(" -L, --list List items in the channel") + print(" -K, --keys List all keys found in channel items") + print() + print("Item Commands (need ITEMID):") + print(" -I, --item Display known information about the item(s)") + print(" -H, --hide Mark the item(s) as hidden") + print(" -U, --unhide Mark the item(s) as not hidden") + print() + print("Other Options:") + print(" -h, --help Display this help message and exit") sys.exit(0) + def usage_error(msg, *args): - print >>sys.stderr, msg, " ".join(args) - print >>sys.stderr, "Perhaps you need --help ?" 
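# A note on the argparse port below: every command flag is a store_const
# into the single dest="command", so a second command flag silently
# overwrites the first, and the post-parse "more than one command" check
# can never fire -- the set it builds contains at most the one key
# "command". A minimal sketch of the idiomatic alternative, assuming the
# same option names as the code below (only two of the six options shown):

import argparse

parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("-C", "--channel", action="store_const",
                   const="channel", dest="command")
group.add_argument("-L", "--list", action="store_const",
                   const="list", dest="command")
# argparse itself now rejects "-C -L" with a clear error, and required=True
# replaces the hand-rolled "one command option must be supplied" branch.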
+ print(msg, " ".join(args), file=sys.stderr) + print("Perhaps you need --help ?", file=sys.stderr) sys.exit(1) + def print_keys(item, title): keys = item.keys() - keys.sort() - key_len = max([ len(k) for k in keys ]) + key_len = max([len(k) for k in sorted(keys)]) - print title + ":" - for key in keys: + print(title + ":") + for key in sorted(keys): if item.key_type(key) == item.DATE: value = time.strftime(planet.TIMEFMT_ISO, item[key]) else: value = str(item[key]) - print " %-*s %s" % (key_len, key, fit_str(value, 74 - key_len)) + print(" %-*s %s" % (key_len, key, fit_str(value, 74 - key_len))) + def fit_str(string, length): if len(string) <= length: return string else: - return string[:length-4] + " ..." + return string[: length - 4] + " ..." if __name__ == "__main__": - cache_file = None - want_ids = 0 ids = [] - command = None - - for arg in sys.argv[1:]: - if arg == "-h" or arg == "--help": - usage() - elif arg == "-C" or arg == "--channel": - if command is not None: - usage_error("Only one command option may be supplied") - command = "channel" - elif arg == "-L" or arg == "--list": - if command is not None: - usage_error("Only one command option may be supplied") - command = "list" - elif arg == "-K" or arg == "--keys": - if command is not None: - usage_error("Only one command option may be supplied") - command = "keys" - elif arg == "-I" or arg == "--item": - if command is not None: - usage_error("Only one command option may be supplied") - command = "item" - want_ids = 1 - elif arg == "-H" or arg == "--hide": - if command is not None: - usage_error("Only one command option may be supplied") - command = "hide" - want_ids = 1 - elif arg == "-U" or arg == "--unhide": - if command is not None: - usage_error("Only one command option may be supplied") - command = "unhide" - want_ids = 1 - elif arg.startswith("-"): - usage_error("Unknown option:", arg) - else: - if cache_file is None: - cache_file = arg - elif want_ids: - ids.append(arg) - else: - usage_error("Unexpected extra argument:", arg) - - if cache_file is None: + parser = argparse.ArgumentParser( + description="Examine and modify information in the Planet cache." 
+ ) + parser.add_argument( + "-C", + "--channel", + action="store_const", + const="channel", + dest="command", + help="Display known information on the channel", + ) + parser.add_argument( + "-L", + "--list", + action="store_const", + const="list", + dest="command", + help="List items in the channel", + ) + parser.add_argument( + "-K", + "--keys", + action="store_const", + const="keys", + dest="command", + help="List all keys found in channel items", + ) + parser.add_argument( + "-I", + "--item", + action="store_const", + const="item", + dest="command", + help="Display known information about the item(s)", + ) + parser.add_argument( + "-H", + "--hide", + action="store_const", + const="hide", + dest="command", + help="Mark the item(s) as hidden", + ) + parser.add_argument( + "-U", + "--unhide", + action="store_const", + const="unhide", + dest="command", + help="Mark the item(s) as not hidden", + ) + parser.add_argument("cache_file", help="Cache file to operate on") + parser.add_argument( + "item_ids", + nargs="*", + help="Item IDs to operate on when using item-related commands", + ) + + args = parser.parse_args() + + # Check if more than one command option was supplied + if "command" not in args or args.command is None: + usage_error("One command option must be supplied.") + elif ( + len( + { + key + for key, value in vars(args).items() + if key == "command" and value is not None + } + ) + > 1 + ): + usage_error("Only one command option may be supplied") + + # Handle missing cache_file + if not args.cache_file: usage_error("Missing expected cache filename") - elif want_ids and not len(ids): + + # Handle commands that require item IDs + if args.command in ["item", "hide", "unhide"] and not args.item_ids: usage_error("Missing expected entry ids") # Open the cache file directly to get the URL it represents try: - db = dbhash.open(cache_file) - url = db["url"] - db.close() - except dbhash.bsddb._db.DBError, e: - print >>sys.stderr, cache_file + ":", e.args[1] - sys.exit(1) + with shelve.open(args.cache_file, "r") as db: + url = db["url"] except KeyError: - print >>sys.stderr, cache_file + ": Probably not a cache file" + print(f"{args.cache_file}: Probably not a cache file", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"{args.cache_file}: {e!s}", file=sys.stderr) sys.exit(1) # Now do it the right way :-) - my_planet = planet.Planet(ConfigParser.ConfigParser()) - my_planet.cache_directory = os.path.dirname(cache_file) + my_planet = planet.Planet(configparser.ConfigParser()) + my_planet.cache_directory = os.path.dirname(args.cache_file) channel = planet.Channel(my_planet, url) - for item_id in ids: + for item_id in args.item_ids: if not channel.has_item(item_id): - print >>sys.stderr, item_id + ": Not in channel" + print(item_id + ": Not in channel", file=sys.stderr) sys.exit(1) # Do the user's bidding - if command == "channel": + if args.command == "channel": print_keys(channel, "Channel Keys") - elif command == "item": - for item_id in ids: + elif args.command == "item": + for item_id in args.item_ids: item = channel.get_item(item_id) print_keys(item, "Item Keys for %s" % item_id) - elif command == "list": - print "Items in Channel:" - for item in channel.items(hidden=1, sorted=1): - print " " + item.id - print " " + time.strftime(planet.TIMEFMT_ISO, item.date) + elif args.command == "list": + print("Items in Channel:") + for item in channel.items(hidden=True, sorted=True): + print(" " + item.id) + print(" " + time.strftime(planet.TIMEFMT_ISO, item.date)) if hasattr(item, 
"title"): - print " " + fit_str(item.title, 70) + print(" " + fit_str(item.title, 70)) if hasattr(item, "hidden"): - print " (hidden)" + print(" (hidden)") - elif command == "keys": + elif args.command == "keys": keys = {} for item in channel.items(): - for key in item.keys(): + for key in item: keys[key] = 1 - keys = keys.keys() - keys.sort() + keys = sorted(keys.keys()) - print "Keys used in Channel:" + print("Keys used in Channel:") for key in keys: - print " " + key - print + print(" " + key) + print() - print "Use --item to output values of particular items." + print("Use --item to output values of particular items.") - elif command == "hide": - for item_id in ids: + elif args.command == "hide": + for item_id in args.item_ids: item = channel.get_item(item_id) if hasattr(item, "hidden"): - print item_id + ": Already hidden." + print(item_id + ": Already hidden.") else: item.hidden = "yes" channel.cache_write() - print "Done." + print("Done.") - elif command == "unhide": - for item_id in ids: + elif args.command == "unhide": + for item_id in args.item_ids: item = channel.get_item(item_id) if hasattr(item, "hidden"): - del(item.hidden) + del item.hidden else: - print item_id + ": Not hidden." + print(item_id + ": Not hidden.") channel.cache_write() - print "Done." + print("Done.") diff --git a/code/planet.py b/code/planet.py index 41141b67..f4c29886 100755 --- a/code/planet.py +++ b/code/planet.py @@ -1,30 +1,19 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """The Planet aggregator. A flexible and easy-to-use aggregator for generating websites. - -Visit http://www.planetplanet.org/ for more information and to download -the latest version. - -Requires Python 2.1, recommends 2.3. """ -__authors__ = [ "Scott James Remnant ", - "Jeff Waugh " ] -__license__ = "Python" - - -import os -import sys -import time +import argparse +import configparser import locale +import os import socket -import urlparse +import sys +from urllib.parse import urljoin import planet -from ConfigParser import ConfigParser - # Default configuration file path CONFIG_FILE = "config.ini" @@ -32,75 +21,77 @@ PLANET_NAME = "Unconfigured Planet" PLANET_LINK = "Unconfigured Planet" PLANET_FEED = None -OWNER_NAME = "Anonymous Coward" +OWNER_NAME = "Anonymous Coward" OWNER_EMAIL = "" -LOG_LEVEL = "WARNING" -FEED_TIMEOUT = 20 # seconds +LOG_LEVEL = "WARNING" +FEED_TIMEOUT = 20 # seconds # Default template file list TEMPLATE_FILES = "examples/basic/planet.html.tmpl" - -def config_get(config, section, option, default=None, raw=0, vars=None): +def config_get(config, section, option, default=None, raw=False, vars=None): """Get a value from the configuration, with a default.""" if config.has_option(section, option): return config.get(section, option, raw=raw, vars=None) else: return default + def main(): config_file = CONFIG_FILE offline = 0 verbose = 0 - for arg in sys.argv[1:]: - if arg == "-h" or arg == "--help": - print "Usage: planet [options] [CONFIGFILE]" - print - print "Options:" - print " -v, --verbose DEBUG level logging during update" - print " -o, --offline Update the Planet from the cache only" - print " -h, --help Display this help message and exit" - print - sys.exit(0) - elif arg == "-v" or arg == "--verbose": - verbose = 1 - elif arg == "-o" or arg == "--offline": - offline = 1 - elif arg.startswith("-"): - print >>sys.stderr, "Unknown option:", arg - sys.exit(1) - else: - config_file = arg + parser = argparse.ArgumentParser(description="The Planet aggregator") + + parser.add_argument( + "-v", "--verbose", 
action="store_true", help="DEBUG level logging during update" + ) + parser.add_argument( + "-o", + "--offline", + action="store_true", + help="Update the Planet from the cache only", + ) + parser.add_argument( + "config_file", nargs="?", help="Configuration file", default=CONFIG_FILE + ) + + args = parser.parse_args() + + verbose = args.verbose + offline = args.offline + config_file = args.config_file # Read the configuration file - config = ConfigParser() + config = configparser.ConfigParser() config.read(config_file) if not config.has_section("Planet"): - print >>sys.stderr, "Configuration missing [Planet] section." + print("Configuration missing [Planet] section.", file=sys.stderr) sys.exit(1) # Read the [Planet] config section - planet_name = config_get(config, "Planet", "name", PLANET_NAME) - planet_link = config_get(config, "Planet", "link", PLANET_LINK) - planet_feed = config_get(config, "Planet", "feed", PLANET_FEED) - owner_name = config_get(config, "Planet", "owner_name", OWNER_NAME) + planet_name = config_get(config, "Planet", "name", PLANET_NAME) + planet_link = config_get(config, "Planet", "link", PLANET_LINK) + planet_feed = config_get(config, "Planet", "feed", PLANET_FEED) + owner_name = config_get(config, "Planet", "owner_name", OWNER_NAME) owner_email = config_get(config, "Planet", "owner_email", OWNER_EMAIL) if verbose: log_level = "DEBUG" else: - log_level = config_get(config, "Planet", "log_level", LOG_LEVEL) - feed_timeout = config_get(config, "Planet", "feed_timeout", FEED_TIMEOUT) - template_files = config_get(config, "Planet", "template_files", - TEMPLATE_FILES).split(" ") + log_level = config_get(config, "Planet", "log_level", LOG_LEVEL) + feed_timeout = config_get(config, "Planet", "feed_timeout", FEED_TIMEOUT) + template_files = config_get( + config, "Planet", "template_files", TEMPLATE_FILES + ).split(" ") # Default feed to the first feed for which there is a template if not planet_feed: for template_file in template_files: name = os.path.splitext(os.path.basename(template_file))[0] - if name.find('atom')>=0 or name.find('rss')>=0: - planet_feed = urlparse.urljoin(planet_link, name) + if name.find("atom") >= 0 or name.find("rss") >= 0: + planet_feed = urljoin(planet_link, name) break # Define locale @@ -108,7 +99,7 @@ def main(): # The user can specify more than one locale (separated by ":") as # fallbacks. locale_ok = False - for user_locale in config.get("Planet", "locale").split(':'): + for user_locale in config.get("Planet", "locale").split(":"): user_locale = user_locale.strip() try: locale.setlocale(locale.LC_ALL, user_locale) @@ -118,7 +109,7 @@ def main(): locale_ok = True break if not locale_ok: - print >>sys.stderr, "Unsupported locale setting." 
+ print("Unsupported locale setting.", file=sys.stderr) sys.exit(1) # Activate logging @@ -134,7 +125,9 @@ def main(): try: feed_timeout = float(feed_timeout) except: - log.warning("Feed timeout set to invalid value '%s', skipping", feed_timeout) + log.warning( + "Feed timeout set to invalid value '%s', skipping", feed_timeout + ) feed_timeout = None if feed_timeout and not offline: @@ -145,10 +138,10 @@ def main(): my_planet = planet.Planet(config) my_planet.run(planet_name, planet_link, template_files, offline) - my_planet.generate_all_files(template_files, planet_name, - planet_link, planet_feed, owner_name, owner_email) + my_planet.generate_all_files( + template_files, planet_name, planet_link, planet_feed, owner_name, owner_email + ) if __name__ == "__main__": main() - diff --git a/code/planet/__init__.py b/code/planet/__init__.py index 929920b0..94d3e885 100644 --- a/code/planet/__init__.py +++ b/code/planet/__init__.py @@ -1,5 +1,4 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- +#!/usr/bin/env python3 """Planet aggregator library. This package is a library for developing web sites or software that @@ -8,41 +7,39 @@ """ __version__ = "2.0" -__authors__ = [ "Scott James Remnant ", - "Jeff Waugh " ] +__authors__ = [ + "Scott James Remnant ", + "Jeff Waugh ", +] __license__ = "Python" +import logging +import os +import re +import shelve +import time +from datetime import datetime +from functools import total_ordering +from hashlib import md5 +from html.parser import HTMLParser +from typing import cast +from xml.sax.saxutils import escape -# Modules available without separate import -import cache import feedparser -import sanitize -import htmltmpl -import sgmllib -try: - import logging -except: - import compat_logging as logging - -# Limit the effect of "from planet import *" -__all__ = ("cache", "feedparser", "htmltmpl", "logging", - "Planet", "Channel", "NewsItem") - +import jinja2 +from markupsafe import Markup -import os -import md5 -import time -import dbhash -import re +from . 
import cache -try: - from xml.sax.saxutils import escape -except: - def escape(data): - return data.replace("&","&").replace(">",">").replace("<","<") +# Limit the effect of "from planet import *" +__all__ = ( + "Planet", + "Channel", + "NewsItem", +) # Version information (for generator headers) -VERSION = ("Planet/%s +http://www.planetplanet.org" % __version__) +VERSION = "Planet/%s +http://www.planetplanet.org" % __version__ # Default User-Agent header to send when retreiving feeds USER_AGENT = VERSION + " " + feedparser.USER_AGENT @@ -57,7 +54,6 @@ def escape(data): TIMEFMT_ISO = "%Y-%m-%dT%H:%M:%S+00:00" TIMEFMT_822 = "%a, %d %b %Y %H:%M:%S +0000" - # Log instance to use here log = logging.getLogger("planet") try: @@ -66,23 +62,28 @@ def escape(data): log.warning = log.warn # Defaults for the template file config sections -ENCODING = "utf-8" -ITEMS_PER_PAGE = 60 -DAYS_PER_PAGE = 0 -OUTPUT_DIR = "output" -DATE_FORMAT = "%B %d, %Y %I:%M %p" +ENCODING = "utf-8" +ITEMS_PER_PAGE = 60 +DAYS_PER_PAGE = 0 +OUTPUT_DIR = "output" +DATE_FORMAT = "%B %d, %Y %I:%M %p" NEW_DATE_FORMAT = "%B %d, %Y" ACTIVITY_THRESHOLD = 0 -class stripHtml(sgmllib.SGMLParser): - "remove all tags from the data" - def __init__(self, data): - sgmllib.SGMLParser.__init__(self) - self.result='' - self.feed(data) - self.close() + +class stripHtml(HTMLParser): + """remove all tags from the data""" + + def __init__(self): + super().__init__() + self.result = [] + def handle_data(self, data): - if data: self.result+=data + self.result.append(data) + + def get_data(self): + return "".join(self.result) + def template_info(item, date_format): """Produce a dictionary of template information.""" @@ -95,8 +96,8 @@ def template_info(item, date_format): info[key + "_822"] = time.strftime(TIMEFMT_822, date) else: info[key] = item[key] - if 'title' in item.keys(): - info['title_plain'] = stripHtml(info['title']).result + if "title" in item.keys(): + info["title_plain"] = Markup(info["title"]) return info @@ -114,18 +115,25 @@ class Planet: filter A regular expression that articles must match. exclude A regular expression that articles must not match. 
""" + def __init__(self, config): self.config = config self._channels = [] self.user_agent = USER_AGENT - self.cache_directory = CACHE_DIRECTORY - self.new_feed_items = NEW_FEED_ITEMS + if self.config.has_option("Planet", "cache_directory"): + self.cache_directory = self.config.get("Planet", "cache_directory") + else: + self.cache_directory = CACHE_DIRECTORY + if self.config.has_option("Planet", "new_feed_items"): + self.new_feed_items = int(self.config.get("Planet", "new_feed_items")) + else: + self.new_feed_items = NEW_FEED_ITEMS self.filter = None self.exclude = None - def tmpl_config_get(self, template, option, default=None, raw=0, vars=None): + def tmpl_config_get(self, template, option, default=None, raw=False, vars=None): """Get a template value from the configuration, with a default.""" if self.config.has_option(template, option): return self.config.get(template, option, raw=raw, vars=None) @@ -135,47 +143,51 @@ def tmpl_config_get(self, template, option, default=None, raw=0, vars=None): return default def gather_channel_info(self, template_file="Planet"): - date_format = self.tmpl_config_get(template_file, - "date_format", DATE_FORMAT, raw=1) + date_format = self.tmpl_config_get( + template_file, "date_format", DATE_FORMAT, raw=True + ) - activity_threshold = int(self.tmpl_config_get(template_file, - "activity_threshold", - ACTIVITY_THRESHOLD)) + activity_threshold = int( + self.tmpl_config_get( + template_file, "activity_threshold", ACTIVITY_THRESHOLD + ) + ) if activity_threshold: - activity_horizon = \ - time.gmtime(time.time()-86400*activity_threshold) + activity_horizon = time.gmtime(time.time() - 86400 * activity_threshold) else: activity_horizon = 0 channels = {} channels_list = [] - for channel in self.channels(hidden=1): + for channel in self.channels(hidden=True): channels[channel] = template_info(channel, date_format) channels_list.append(channels[channel]) # identify inactive feeds if activity_horizon: - latest = channel.items(sorted=1) - if len(latest)==0 or latest[0].date < activity_horizon: - channels[channel]["message"] = \ + latest = channel.items(sorted=True) + if len(latest) == 0 or latest[0].date < activity_horizon: + channels[channel]["message"] = ( "no activity in %d days" % activity_threshold + ) # report channel level errors - if not channel.url_status: continue + if not channel.url_status: + continue status = int(channel.url_status) if status == 403: - channels[channel]["message"] = "403: forbidden" + channels[channel]["message"] = "403: forbidden" elif status == 404: - channels[channel]["message"] = "404: not found" + channels[channel]["message"] = "404: not found" elif status == 408: - channels[channel]["message"] = "408: request timeout" + channels[channel]["message"] = "408: request timeout" elif status == 410: - channels[channel]["message"] = "410: gone" + channels[channel]["message"] = "410: gone" elif status == 500: - channels[channel]["message"] = "internal server error" + channels[channel]["message"] = "internal server error" elif status >= 400: - channels[channel]["message"] = "http status %s" % status + channels[channel]["message"] = "http status %s" % status return channels, channels_list @@ -184,55 +196,58 @@ def gather_items_info(self, channels, template_file="Planet", channel_list=None) prev_date = [] prev_channel = None - date_format = self.tmpl_config_get(template_file, - "date_format", DATE_FORMAT, raw=1) - items_per_page = int(self.tmpl_config_get(template_file, - "items_per_page", ITEMS_PER_PAGE)) - days_per_page = 
int(self.tmpl_config_get(template_file, - "days_per_page", DAYS_PER_PAGE)) - new_date_format = self.tmpl_config_get(template_file, - "new_date_format", NEW_DATE_FORMAT, raw=1) - - for newsitem in self.items(max_items=items_per_page, - max_days=days_per_page, - channels=channel_list): + date_format = self.tmpl_config_get( + template_file, "date_format", DATE_FORMAT, raw=True + ) + items_per_page = int( + cast( + int, + self.tmpl_config_get(template_file, "items_per_page", ITEMS_PER_PAGE), + ) + ) + days_per_page = int( + cast( + int, self.tmpl_config_get(template_file, "days_per_page", DAYS_PER_PAGE) + ) + ) + new_date_format = self.tmpl_config_get( + template_file, "new_date_format", NEW_DATE_FORMAT, raw=True + ) + + for newsitem in self.items( + max_items=items_per_page, max_days=days_per_page, channels=channel_list + ): item_info = template_info(newsitem, date_format) chan_info = channels[newsitem._channel] for k, v in chan_info.items(): item_info["channel_" + k] = v - + # Check for the start of a new day if prev_date[:3] != newsitem.date[:3]: prev_date = newsitem.date - item_info["new_date"] = time.strftime(new_date_format, - newsitem.date) - + item_info["new_date"] = time.strftime(new_date_format, newsitem.date) + # Check for the start of a new channel - if item_info.has_key("new_date") \ - or prev_channel != newsitem._channel: + if "new_date" in item_info or prev_channel != newsitem._channel: prev_channel = newsitem._channel item_info["new_channel"] = newsitem._channel.url - + items_list.append(item_info) return items_list - def run(self, planet_name, planet_link, template_files, offline = False): + def run(self, planet_name, planet_link, template_files, offline=False): log = logging.getLogger("planet.runner") # Create a planet log.info("Loading cached data") - if self.config.has_option("Planet", "cache_directory"): - self.cache_directory = self.config.get("Planet", "cache_directory") - if self.config.has_option("Planet", "new_feed_items"): - self.new_feed_items = int(self.config.get("Planet", "new_feed_items")) - self.user_agent = "%s +%s %s" % (planet_name, planet_link, - self.user_agent) + self.user_agent = f"{planet_name} +{planet_link} {self.user_agent}" if self.config.has_option("Planet", "filter"): self.filter = self.config.get("Planet", "filter") # The other configuration blocks are channels to subscribe to for feed_url in self.config.sections(): + # The "Planet" config section is a special case. 
We also allow template-file specific configuration, apparently :D if feed_url == "Planet" or feed_url in template_files: continue @@ -242,101 +257,117 @@ def run(self, planet_name, planet_link, template_files, offline = False): # Update it try: - if not offline and not channel.url_status == '410': + if not offline and channel.url_status != "410": channel.update() except KeyboardInterrupt: raise except: log.exception("Update of <%s> failed", feed_url) - def generate_all_files(self, template_files, planet_name, - planet_link, planet_feed, owner_name, owner_email): - + def generate_all_files( + self, + template_files, + planet_name, + planet_link, + planet_feed, + owner_name, + owner_email, + ): log = logging.getLogger("planet.runner") + # Go-go-gadget-template for template_file in template_files: - manager = htmltmpl.TemplateManager() + # Jinja2 Environment setup for template loading + template_loader = jinja2.FileSystemLoader( + searchpath=os.path.dirname(template_file) + ) + jinja_env = jinja2.Environment(loader=template_loader) + log.info("Processing template %s", template_file) + + # Fallback logic just in case template is not found in the path try: - template = manager.prepare(template_file) - except htmltmpl.TemplateError: - template = manager.prepare(os.path.basename(template_file)) + template = jinja_env.get_template(os.path.basename(template_file)) + except jinja2.TemplateNotFound: + template = jinja_env.get_template(template_file) + # Read the configuration - output_dir = self.tmpl_config_get(template_file, - "output_dir", OUTPUT_DIR) - date_format = self.tmpl_config_get(template_file, - "date_format", DATE_FORMAT, raw=1) + output_dir = self.tmpl_config_get(template_file, "output_dir", OUTPUT_DIR) + date_format = self.tmpl_config_get( + template_file, "date_format", DATE_FORMAT, raw=True + ) encoding = self.tmpl_config_get(template_file, "encoding", ENCODING) - - # We treat each template individually + + # Template processing base = os.path.splitext(os.path.basename(template_file))[0] url = os.path.join(planet_link, base) output_file = os.path.join(output_dir, base) # Gather information - channels, channels_list = self.gather_channel_info(template_file) - items_list = self.gather_items_info(channels, template_file) - - # Gather item information - - # Process the template - tp = htmltmpl.TemplateProcessor(html_escape=0) - tp.set("Items", items_list) - tp.set("Channels", channels_list) - - # Generic information - tp.set("generator", VERSION) - tp.set("name", planet_name) - tp.set("link", planet_link) - tp.set("owner_name", owner_name) - tp.set("owner_email", owner_email) - tp.set("url", url) - + channels, channels_list = self.gather_channel_info(template_file) + items_list = self.gather_items_info(channels, template_file) + + # Prepare the template data (replacing tp.set() calls) + template_data = { + "Items": items_list, + "Channels": channels_list, + "generator": VERSION, + "name": planet_name, + "link": planet_link, + "owner_name": owner_name, + "owner_email": owner_email, + "url": url, + "date": time.strftime(date_format, time.gmtime()), + "date_iso": time.strftime(TIMEFMT_ISO, time.gmtime()), + "date_822": time.strftime(TIMEFMT_822, time.gmtime()), + } + if planet_feed: - tp.set("feed", planet_feed) - tp.set("feedtype", planet_feed.find('rss')>=0 and 'rss' or 'atom') - - # Update time - date = time.gmtime() - tp.set("date", time.strftime(date_format, date)) - tp.set("date_iso", time.strftime(TIMEFMT_ISO, date)) - tp.set("date_822", time.strftime(TIMEFMT_822, date)) + 
template_data["feed"] = planet_feed + template_data["feedtype"] = "rss" if "rss" in planet_feed else "atom" + # Render template try: log.info("Writing %s", output_file) - output_fd = open(output_file, "w") - if encoding.lower() in ("utf-8", "utf8"): - # UTF-8 output is the default because we use that internally - output_fd.write(tp.process(template)) - elif encoding.lower() in ("xml", "html", "sgml"): - # Magic for Python 2.3 users - output = tp.process(template).decode("utf-8") - output_fd.write(output.encode("ascii", "xmlcharrefreplace")) - else: - # Must be a "known" encoding - output = tp.process(template).decode("utf-8") - output_fd.write(output.encode(encoding, "replace")) - output_fd.close() + + rendered_output = template.render(template_data) + + with open(output_file, "w", encoding="utf-8") as output_fd: + if encoding.lower() in ("xml", "html", "sgml"): + output_fd.write( + rendered_output.encode("ascii", "xmlcharrefreplace").decode( + "utf-8" + ) + ) + elif encoding.lower() not in ("utf-8", "utf8"): + # Non-UTF8 encoding + output_fd.write( + rendered_output.encode(encoding, "replace").decode("utf-8") + ) + else: + output_fd.write(rendered_output) + except KeyboardInterrupt: raise - except: + except Exception: log.exception("Write of %s failed", output_file) - def channels(self, hidden=0, sorted=1): + def channels(self, hidden=False, sorted=True): """Return the list of channels.""" channels = [] for channel in self._channels: - if hidden or not channel.has_key("hidden"): + if hidden or "hidden" not in channel: channels.append((channel.name, channel)) if sorted: channels.sort() - return [ c[-1] for c in channels ] + return [c[-1] for c in channels] def find_by_basename(self, basename): for channel in self._channels: - if basename == channel.cache_basename(): return channel + if basename == channel.cache_basename(): + return channel def subscribe(self, channel): """Subscribe the planet to the channel.""" @@ -346,7 +377,9 @@ def unsubscribe(self, channel): """Unsubscribe the planet from the channel.""" self._channels.remove(channel) - def items(self, hidden=0, sorted=1, max_items=0, max_days=0, channels=None): + def items( + self, hidden=False, sorted=True, max_items=False, max_days=False, channels=None + ): """Return an optionally filtered list of items in the channel. The filters are applied in the following order: @@ -361,7 +394,7 @@ def items(self, hidden=0, sorted=1, max_items=0, max_days=0, channels=None): be returned. If max_days is non-zero then any items older than the newest by - this number of days won't be returned. Requires sorted=1 to work. + this number of days won't be returned. Requires sorted= True to work. 
The sharp-eyed will note that this looks a little strange code-wise, @@ -372,56 +405,64 @@ def items(self, hidden=0, sorted=1, max_items=0, max_days=0, channels=None): """ planet_filter_re = None if self.filter: - planet_filter_re = re.compile(self.filter, re.I) + planet_filter_re = re.compile(self.filter, re.IGNORECASE) planet_exclude_re = None if self.exclude: - planet_exclude_re = re.compile(self.exclude, re.I) - + planet_exclude_re = re.compile(self.exclude, re.IGNORECASE) + items = [] seen_guids = {} - if not channels: channels=self.channels(hidden=hidden, sorted=0) + if not channels: + channels = self.channels(hidden=hidden, sorted=False) for channel in channels: for item in channel._items.values(): - if hidden or not item.has_key("hidden"): - + if hidden or "hidden" not in item: channel_filter_re = None if channel.filter: - channel_filter_re = re.compile(channel.filter, - re.I) + channel_filter_re = re.compile(channel.filter, re.IGNORECASE) channel_exclude_re = None if channel.exclude: - channel_exclude_re = re.compile(channel.exclude, - re.I) - if (planet_filter_re or planet_exclude_re \ - or channel_filter_re or channel_exclude_re): + channel_exclude_re = re.compile(channel.exclude, re.IGNORECASE) + if ( + planet_filter_re + or planet_exclude_re + or channel_filter_re + or channel_exclude_re + ): title = "" - if item.has_key("title"): + if "title" in item: title = item.title content = item.get_content("content") if planet_filter_re: - if not (planet_filter_re.search(title) \ - or planet_filter_re.search(content)): + if not ( + planet_filter_re.search(title) + or planet_filter_re.search(content) + ): continue if planet_exclude_re: - if (planet_exclude_re.search(title) \ - or planet_exclude_re.search(content)): + if planet_exclude_re.search(title) or planet_exclude_re.search( + content + ): continue if channel_filter_re: - if not (channel_filter_re.search(title) \ - or channel_filter_re.search(content)): + if not ( + channel_filter_re.search(title) + or channel_filter_re.search(content) + ): continue if channel_exclude_re: - if (channel_exclude_re.search(title) \ - or channel_exclude_re.search(content)): + if channel_exclude_re.search( + title + ) or channel_exclude_re.search(content): continue - if not seen_guids.has_key(item.id): - seen_guids[item.id] = 1; - items.append((time.mktime(item.date), item.order, item)) + if item.id not in seen_guids: + seen_guids[item.id] = 1 + items.append((item.time_since_epoch, item.order, item)) # Sort the list if sorted: @@ -443,7 +484,8 @@ def items(self, hidden=0, sorted=1, max_items=0, max_days=0, channels=None): items = items[:max_count] break - return [ i[-1] for i in items ] + return [i[-1] for i in items] + class Channel(cache.CachedInfo): """A list of news items. @@ -497,16 +539,28 @@ class Channel(cache.CachedInfo): Some feeds may define additional properties to those above. 
""" - IGNORE_KEYS = ("links", "contributors", "textinput", "cloud", "categories", - "url", "href", "url_etag", "url_modified", "tags", "itunes_explicit") + + IGNORE_KEYS = ( + "links", + "contributors", + "textinput", + "cloud", + "categories", + "url", + "href", + "url_etag", + "url_modified", + "tags", + "itunes_explicit", + ) def __init__(self, planet, url): if not os.path.isdir(planet.cache_directory): os.makedirs(planet.cache_directory) cache_filename = cache.filename(planet.cache_directory, url) - cache_file = dbhash.open(cache_filename, "c", 0666) + cache_file = shelve.open(cache_filename, "c") - cache.CachedInfo.__init__(self, cache_file, url, root=1) + cache.CachedInfo.__init__(self, cache_file, url, root=True) self._items = {} self._planet = planet @@ -524,16 +578,25 @@ def __init__(self, planet, url): self.exclude = None self.next_order = "0" self.cache_read() - self.cache_read_entries() + try: + self.cache_read_entries() + except SystemError: + # This can be triggered by https://github.com/python/cpython/issues/91228 (I think!) on + # some DBs, but really, only on macOS. While that is not how this is run in production, + # it's kinda nice to test. So, we catch the failure here, and flush the etag / modified + # fields so that update always works. + log.error(f"DB corruption for {url}; reloading the feed") + self.url_etag = None + self.url_modified = None if planet.config.has_section(url): for option in planet.config.options(url): value = planet.config.get(url, option) - self.set_as_string(option, value, cached=0) + self.set_as_string(option, value, cached=False) def has_item(self, id_): """Check whether the item exists in the channel.""" - return self._items.has_key(id_) + return id_ in self._items def get_item(self, id_): """Return the item from the channel.""" @@ -542,49 +605,57 @@ def get_item(self, id_): # Special methods __contains__ = has_item - def items(self, hidden=0, sorted=0): + def items(self, hidden=False, sorted=False): """Return the item list.""" items = [] for item in self._items.values(): - if hidden or not item.has_key("hidden"): - items.append((time.mktime(item.date), item.order, item)) + if hidden or "hidden" not in item: + try: + items.append((item.time_since_epoch, item.order, item)) + except OverflowError: + log.warning(f"Unable to parse date for {item.id}") if sorted: items.sort() items.reverse() - return [ i[-1] for i in items ] + return [i[-1] for i in items] def __iter__(self): """Iterate the sorted item list.""" - return iter(self.items(sorted=1)) + return iter(self.items(sorted=True)) def cache_read_entries(self): """Read entry information from the cache.""" + keys = self._cache.keys() for key in keys: - if key.find(" ") != -1: continue - if self.has_key(key): continue + if key.find(" ") != -1: + continue + + # We can't use __contains__ / `in` syntax with the Cache type; it does something + # different (using `has_item` above) + if self.has_key(key): + continue item = NewsItem(self, key) self._items[key] = item def cache_basename(self): - return cache.filename('',self._id) + return cache.filename("", self._id) - def cache_write(self, sync=1): + def cache_write(self, sync: bool = True): """Write channel and item information to the cache.""" for item in self._items.values(): - item.cache_write(sync=0) + item.cache_write(sync=False) for item in self._expired: - item.cache_clear(sync=0) + item.cache_clear(sync=False) cache.CachedInfo.cache_write(self, sync) self._expired = [] def feed_information(self): - """ - Returns a description string for the 
feed embedded in this channel. + """Returns a description string for the feed embedded in this channel. This will usually simply be the feed url embedded in <>, but in the case where the current self.url has changed from the original @@ -596,7 +667,7 @@ def feed_information(self): if self.url == self.configured_url: return "<%s>" % self.url else: - return "<%s> (formerly <%s>)" % (self.url, self.configured_url) + return f"<{self.url}> (formerly <{self.configured_url}>)" def update(self): """Download the feed to refresh the information. @@ -604,51 +675,61 @@ def update(self): This does the actual work of pulling down the feed and if it changes updates the cached information about the feed and entries within it. """ - info = feedparser.parse(self.url, - etag=self.url_etag, modified=self.url_modified, - agent=self._planet.user_agent) - if info.has_key("status"): - self.url_status = str(info.status) - elif info.has_key("entries") and len(info.entries)>0: - self.url_status = str(200) - elif info.bozo and info.bozo_exception.__class__.__name__=='Timeout': - self.url_status = str(408) + info = feedparser.parse( + self.url, + etag=self.url_etag, + modified=self.url_modified, + agent=self._planet.user_agent, + ) + + if hasattr(info, "status"): + self.url_status = str(info.status) + elif hasattr(info, "entries") and info.entries: + self.url_status = "200" + elif hasattr(info, "bozo") and info.bozo and hasattr(info, "bozo_exception"): + if info.bozo_exception.__class__.__name__ == "Timeout": + self.url_status = "408" + else: + self.url_status = "500" else: - self.url_status = str(500) + self.url_status = "500" - if self.url_status == '301' and \ - (info.has_key("entries") and len(info.entries)>0): + if self.url_status == "301" and ("entries" in info and len(info.entries) > 0): log.warning("Feed has moved from <%s> to <%s>", self.url, info.url) try: - os.link(cache.filename(self._planet.cache_directory, self.url), - cache.filename(self._planet.cache_directory, info.url)) + os.link( + cache.filename(self._planet.cache_directory, self.url), + cache.filename(self._planet.cache_directory, info.url), + ) except: pass self.url = info.url - elif self.url_status == '304': + elif self.url_status == "304": log.info("Feed %s unchanged", self.feed_information()) return - elif self.url_status == '410': + elif self.url_status == "410": log.info("Feed %s gone", self.feed_information()) self.cache_write() return - elif self.url_status == '408': + elif self.url_status == "408": log.warning("Feed %s timed out", self.feed_information()) return elif int(self.url_status) >= 400: - log.error("Error %s while updating feed %s", - self.url_status, self.feed_information()) + log.error( + "Error %s while updating feed %s", + self.url_status, + self.feed_information(), + ) return else: log.info("Updating feed %s", self.feed_information()) - self.url_etag = info.has_key("etag") and info.etag or None - self.url_modified = info.has_key("modified") and info.modified or None + self.url_etag = "etag" in info and info.etag or None + self.url_modified = "modified" in info and info.modified or None if self.url_etag is not None: - log.debug("E-Tag: %s", self.url_etag) + log.debug(f"E-Tag: {self.url_etag}") if self.url_modified is not None: - log.debug("Last Modified: %s", - time.strftime(TIMEFMT_ISO, self.url_modified)) + log.debug(f"Last Modified: {self.url_modified}") self.update_info(info.feed) self.update_entries(info.entries) @@ -665,51 +746,50 @@ def update_info(self, feed): if key in self.IGNORE_KEYS or key + "_parsed" in 
self.IGNORE_KEYS: # Ignored fields pass - elif feed.has_key(key + "_parsed"): + elif key + "_parsed" in feed: # Ignore unparsed date fields pass elif key.endswith("_detail"): # retain name and email sub-fields - if feed[key].has_key('name') and feed[key].name: - self.set_as_string(key.replace("_detail","_name"), \ - feed[key].name) - if feed[key].has_key('email') and feed[key].email: - self.set_as_string(key.replace("_detail","_email"), \ - feed[key].email) + if "name" in feed[key] and feed[key].name: + self.set_as_string(key.replace("_detail", "_name"), feed[key].name) + if "email" in feed[key] and feed[key].email: + self.set_as_string( + key.replace("_detail", "_email"), feed[key].email + ) elif key == "items": # Ignore items field pass elif key.endswith("_parsed"): # Date fields if feed[key] is not None: - self.set_as_date(key[:-len("_parsed")], feed[key]) + self.set_as_date(key[: -len("_parsed")], feed[key]) elif key == "image": # Image field: save all the information - if feed[key].has_key("url"): + if "url" in feed[key]: self.set_as_string(key + "_url", feed[key].url) - if feed[key].has_key("link"): + if "link" in feed[key]: self.set_as_string(key + "_link", feed[key].link) - if feed[key].has_key("title"): + if "title" in feed[key]: self.set_as_string(key + "_title", feed[key].title) - if feed[key].has_key("width"): + if "width" in feed[key]: self.set_as_string(key + "_width", str(feed[key].width)) - if feed[key].has_key("height"): + if "height" in feed[key]: self.set_as_string(key + "_height", str(feed[key].height)) - elif isinstance(feed[key], (str, unicode)): + elif isinstance(feed[key], str): # String fields try: - detail = key + '_detail' - if feed.has_key(detail) and feed[detail].has_key('type'): - if feed[detail].type == 'text/html': - feed[key] = sanitize.HTML(feed[key]) - elif feed[detail].type == 'text/plain': + detail = key + "_detail" + if detail in feed and "type" in feed[detail]: + if feed[detail].type == "text/html": + feed[key] = Markup(feed[key]) + elif feed[detail].type == "text/plain": feed[key] = escape(feed[key]) self.set_as_string(key, feed[key]) except KeyboardInterrupt: raise except: - log.exception("Ignored '%s' of <%s>, unknown format", - key, self.url) + log.exception("Ignored '%s' of <%s>, unknown format", key, self.url) def update_entries(self, entries): """Update entries from the feed. 
@@ -736,16 +816,16 @@ def update_entries(self, entries): feed_items = [] for entry in entries: # Try really hard to find some kind of unique identifier - if entry.has_key("id"): + if "id" in entry: entry_id = cache.utf8(entry.id) - elif entry.has_key("link"): + elif "link" in entry: entry_id = cache.utf8(entry.link) - elif entry.has_key("title"): - entry_id = (self.url + "/" - + md5.new(cache.utf8(entry.title)).hexdigest()) - elif entry.has_key("summary"): - entry_id = (self.url + "/" - + md5.new(cache.utf8(entry.summary)).hexdigest()) + elif "title" in entry: + entry_id = self.url + "/" + md5.new(cache.utf8(entry.title)).hexdigest() + elif "summary" in entry: + entry_id = ( + self.url + "/" + md5.new(cache.utf8(entry.summary)).hexdigest() + ) else: log.error("Unable to find or generate id, entry ignored") continue @@ -761,8 +841,11 @@ def update_entries(self, entries): feed_items.append(entry_id) # Hide excess items the first time through - if self.last_updated is None and self._planet.new_feed_items \ - and len(feed_items) > self._planet.new_feed_items: + if ( + self.last_updated is None + and self._planet.new_feed_items + and len(feed_items) > self._planet.new_feed_items + ): item.hidden = "yes" log.debug("Marked <%s> as hidden (new feed)", entry_id) @@ -774,13 +857,13 @@ def update_entries(self, entries): # Check for expired or replaced items feed_count = len(feed_items) log.debug("Items in Feed: %d", feed_count) - for item in self.items(sorted=1): + for item in self.items(sorted=True): if feed_count < 1: break elif item.id in feed_items: feed_count -= 1 - elif item._channel.url_status != '226': - del(self._items[item.id]) + elif item._channel.url_status != "226": + del self._items[item.id] self._expired.append(item) log.debug("Removed expired or replaced item <%s>", item.id) @@ -792,6 +875,8 @@ def get_name(self, key): return "" + +@total_ordering class NewsItem(cache.CachedInfo): """An item of news. @@ -830,15 +915,23 @@ class NewsItem(cache.CachedInfo): Some feeds may define additional properties to those above. 
""" - IGNORE_KEYS = ("categories", "contributors", "enclosures", "links", - "guidislink", "date", "tags") + + IGNORE_KEYS = ( + "categories", + "contributors", + "enclosures", + "links", + "guidislink", + "date", + "tags", + ) def __init__(self, channel, id_): cache.CachedInfo.__init__(self, channel._cache, id_) self._channel = channel self.id = id_ - self.id_hash = md5.new(id_).hexdigest() + self.id_hash = md5(id_.encode()).hexdigest() self.date = None self.order = None self.content = None @@ -850,67 +943,89 @@ def update(self, entry): if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS: # Ignored fields pass - elif entry.has_key(key + "_parsed"): + elif key + "_parsed" in entry: # Ignore unparsed date fields pass elif key.endswith("_detail"): # retain name, email, and language sub-fields - if entry[key].has_key('name') and entry[key].name: - self.set_as_string(key.replace("_detail","_name"), \ - entry[key].name) - if entry[key].has_key('email') and entry[key].email: - self.set_as_string(key.replace("_detail","_email"), \ - entry[key].email) - if entry[key].has_key('language') and entry[key].language and \ - (not self._channel.has_key('language') or \ - entry[key].language != self._channel.language): - self.set_as_string(key.replace("_detail","_language"), \ - entry[key].language) + if "name" in entry[key] and entry[key].name: + self.set_as_string(key.replace("_detail", "_name"), entry[key].name) + if "email" in entry[key] and entry[key].email: + self.set_as_string( + key.replace("_detail", "_email"), entry[key].email + ) + if ( + "language" in entry[key] + and entry[key].language + and ( + "language" not in self._channel + or entry[key].language != self._channel.language + ) + ): + self.set_as_string( + key.replace("_detail", "_language"), entry[key].language + ) elif key.endswith("_parsed"): # Date fields if entry[key] is not None: - self.set_as_date(key[:-len("_parsed")], entry[key]) + self.set_as_date(key[: -len("_parsed")], entry[key]) elif key == "source": # Source field: save both url and value - if entry[key].has_key("value"): + if "value" in entry[key]: self.set_as_string(key + "_name", entry[key].value) - if entry[key].has_key("url"): + if "url" in entry[key]: self.set_as_string(key + "_link", entry[key].url) elif key == "content": # Content field: concatenate the values value = "" for item in entry[key]: - if item.type == 'text/html': - item.value = sanitize.HTML(item.value) - elif item.type == 'text/plain': + if item.type == "text/html": + item.value = Markup(item.value) + elif item.type == "text/plain": item.value = escape(item.value) - if item.has_key('language') and item.language and \ - (not self._channel.has_key('language') or - item.language != self._channel.language) : + if ( + "language" in item + and item.language + and ( + "language" not in self._channel + or item.language != self._channel.language + ) + ): self.set_as_string(key + "_language", item.language) value += cache.utf8(item.value) self.set_as_string(key, value) - elif isinstance(entry[key], (str, unicode)): + elif isinstance(entry[key], str): # String fields try: - detail = key + '_detail' - if entry.has_key(detail): - if entry[detail].has_key('type'): - if entry[detail].type == 'text/html': - entry[key] = sanitize.HTML(entry[key]) - elif entry[detail].type == 'text/plain': + detail = key + "_detail" + if detail in entry: + if "type" in entry[detail]: + if entry[detail].type == "text/html": + entry[key] = Markup(entry[key]) + elif entry[detail].type == "text/plain": entry[key] = 
escape(entry[key]) self.set_as_string(key, entry[key]) except KeyboardInterrupt: raise except: - log.exception("Ignored '%s' of <%s>, unknown format", - key, self.id) + log.exception("Ignored '%s' of <%s>, unknown format", key, self.id) # Generate the date field if we need to self.get_date("date") - def get_date(self, key): + def __eq__(self, other): + return self.id == other.id + + def __lt__(self, other): + # compare on the date field, and then the order field + if self.date < other.date: + return True + elif self.date == other.date: + return self.order < other.order + else: + return False + + def get_date(self, key: str) -> cache.TimeTuple | None: """Get (or update) the date key. We check whether the date the entry claims to have been changed is @@ -923,31 +1038,39 @@ def get_date(self, key): entries appear in posting sequence but don't overlap entries added in previous updates and don't creep into the next one. """ - for other_key in ("updated", "modified", "published", "issued", "created"): - if self.has_key(other_key): + if other_key in self: date = self.get_as_date(other_key) break else: date = None if date is not None: - if date > self._channel.updated: - date = self._channel.updated -# elif date < self._channel.last_updated: -# date = self._channel.updated - elif self.has_key(key) and self.key_type(key) != self.NULL: + if self._channel.updated is not None: + if date > self._channel.updated: + date = self._channel.updated + # elif date < self._channel.last_updated: + # date = self._channel.updated + elif key in self and self.key_type(key) != self.NULL: return self.get_as_date(key) else: date = self._channel.updated - self.set_as_date(key, date) + if date is not None: + self.set_as_date(key, date) return date + @property + def time_since_epoch(self) -> float: + try: + return time.mktime(self.date) + except OverflowError: + return 0.0 + def get_content(self, key): """Return the key containing the content.""" for key in ("content", "tagline", "summary"): - if self.has_key(key) and self.key_type(key) != self.NULL: + if key in self and self.key_type(key) != self.NULL: return self.get_as_string(key) return "" diff --git a/code/planet/atomstyler.py b/code/planet/atomstyler.py index 9220702c..88d3a211 100644 --- a/code/planet/atomstyler.py +++ b/code/planet/atomstyler.py @@ -1,124 +1,137 @@ -from xml.dom import minidom, Node -from urlparse import urlparse, urlunparse -from xml.parsers.expat import ExpatError -from htmlentitydefs import name2codepoint import re +from html.entities import name2codepoint +from urllib.parse import urlparse, urlunparse +from xml.dom import Node, minidom +from xml.parsers.expat import ExpatError + # select and apply an xml:base for this entry class relativize: - def __init__(self, parent): - self.score = {} - self.links = [] - self.collect_and_tally(parent) - self.base = self.select_optimal_base() - if self.base: - if not parent.hasAttribute('xml:base'): - self.rebase(parent) - parent.setAttribute('xml:base', self.base) - - # collect and tally cite, href and src attributes - def collect_and_tally(self,parent): - uri = None - if parent.hasAttribute('cite'): uri=parent.getAttribute('cite') - if parent.hasAttribute('href'): uri=parent.getAttribute('href') - if parent.hasAttribute('src'): uri=parent.getAttribute('src') - - if uri: - parts=urlparse(uri) - if parts[0].lower() == 'http': - parts = (parts[1]+parts[2]).split('/') - base = None - for i in range(1,len(parts)): - base = tuple(parts[0:i]) - self.score[base] = self.score.get(base,0) + len(base) - if base and 
base not in self.links: self.links.append(base) + def __init__(self, parent): + self.score = {} + self.links = [] + self.collect_and_tally(parent) + self.base = self.select_optimal_base() + if self.base: + if not parent.hasAttribute("xml:base"): + self.rebase(parent) + parent.setAttribute("xml:base", self.base) + + # collect and tally cite, href and src attributes + def collect_and_tally(self, parent): + uri = None + if parent.hasAttribute("cite"): + uri = parent.getAttribute("cite") + if parent.hasAttribute("href"): + uri = parent.getAttribute("href") + if parent.hasAttribute("src"): + uri = parent.getAttribute("src") + + if uri: + parts = urlparse(uri) + if parts[0].lower() == "http": + parts = (parts[1] + parts[2]).split("/") + base = None + for i in range(1, len(parts)): + base = tuple(parts[0:i]) + self.score[base] = self.score.get(base, 0) + len(base) + if base and base not in self.links: + self.links.append(base) + + for node in parent.childNodes: + if node.nodeType == Node.ELEMENT_NODE: + self.collect_and_tally(node) + + # select the xml:base with the highest score + def select_optimal_base(self): + if not self.score: + return None + for link in self.links: + self.score[link] = 0 + winner = max(self.score.values()) + if not winner: + return None + for key in self.score.keys(): + if self.score[key] == winner: + if winner == len(key): + return None + return urlunparse(("http", key[0], "/".join(key[1:]), "", "", "")) + "/" + + # rewrite cite, href and src attributes using this base + def rebase(self, parent): + uri = None + if parent.hasAttribute("cite"): + uri = parent.getAttribute("cite") + if parent.hasAttribute("href"): + uri = parent.getAttribute("href") + if parent.hasAttribute("src"): + uri = parent.getAttribute("src") + if uri and uri.startswith(self.base): + uri = uri[len(self.base) :] or "." + if parent.hasAttribute("href"): + uri = parent.setAttribute("href", uri) + if parent.hasAttribute("src"): + uri = parent.setAttribute("src", uri) + + for node in parent.childNodes: + if node.nodeType == Node.ELEMENT_NODE: + self.rebase(node) - for node in parent.childNodes: - if node.nodeType == Node.ELEMENT_NODE: - self.collect_and_tally(node) - - # select the xml:base with the highest score - def select_optimal_base(self): - if not self.score: return None - for link in self.links: - self.score[link] = 0 - winner = max(self.score.values()) - if not winner: return None - for key in self.score.keys(): - if self.score[key] == winner: - if winner == len(key): return None - return urlunparse(('http', key[0], '/'.join(key[1:]), '', '', '')) + '/' - - # rewrite cite, href and src attributes using this base - def rebase(self,parent): - uri = None - if parent.hasAttribute('cite'): uri=parent.getAttribute('cite') - if parent.hasAttribute('href'): uri=parent.getAttribute('href') - if parent.hasAttribute('src'): uri=parent.getAttribute('src') - if uri and uri.startswith(self.base): - uri = uri[len(self.base):] or '.' 
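# Both the old and the new rebase() read a candidate URI from cite, href or
# src, but only ever write href and src back, so cite attributes are tallied
# by collect_and_tally() yet never actually relativized. A sketch that
# treats the three attributes symmetrically -- a drop-in method assuming the
# same self.base field and minidom nodes as the surrounding class:

from xml.dom import Node

def rebase(self, parent):
    for attr in ("cite", "href", "src"):
        if parent.hasAttribute(attr):
            uri = parent.getAttribute(attr)
            if uri.startswith(self.base):
                # keep "." rather than an empty relative reference
                parent.setAttribute(attr, uri[len(self.base):] or ".")
    for node in parent.childNodes:
        if node.nodeType == Node.ELEMENT_NODE:
            self.rebase(node)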
- if parent.hasAttribute('href'): uri=parent.setAttribute('href', uri) - if parent.hasAttribute('src'): uri=parent.setAttribute('src', uri) - - for node in parent.childNodes: - if node.nodeType == Node.ELEMENT_NODE: - self.rebase(node) # convert type="html" to type="plain" or type="xhtml" as appropriate def retype(parent): - for node in parent.childNodes: - if node.nodeType == Node.ELEMENT_NODE: - - if node.hasAttribute('type') and node.getAttribute('type') == 'html': - if len(node.childNodes)==0: - node.removeAttribute('type') - elif len(node.childNodes)==1: - - # replace html entity defs with utf-8 - chunks=re.split('&(\w+);', node.childNodes[0].nodeValue) - for i in range(1,len(chunks),2): - if chunks[i] in ['amp', 'lt', 'gt', 'apos', 'quot']: - chunks[i] ='&' + chunks[i] +';' - elif chunks[i] in name2codepoint: - chunks[i]=unichr(name2codepoint[chunks[i]]) - else: - chunks[i]='&' + chunks[i] + ';' - text = u"".join(chunks) - - try: - # see if the resulting text is a well-formed XML fragment - div = '
<div xmlns="http://www.w3.org/1999/xhtml">%s</div>
' - data = minidom.parseString((div % text.encode('utf-8'))) - - if text.find('<') < 0: - # plain text - node.removeAttribute('type') - text = data.documentElement.childNodes[0].nodeValue - node.childNodes[0].replaceWholeText(text) - - elif len(text) > 80: - # xhtml - node.setAttribute('type', 'xhtml') - node.removeChild(node.childNodes[0]) - node.appendChild(data.documentElement) - - except ExpatError: - # leave as html - pass - - else: - # recurse - retype(node) - - if parent.nodeName == 'entry': - relativize(parent) - -if __name__ == '__main__': - - # run styler on each file mention on the command line - import sys - for feed in sys.argv[1:]: - doc = minidom.parse(feed) - doc.normalize() - retype(doc.documentElement) - open(feed,'w').write(doc.toxml('utf-8')) + for node in parent.childNodes: + if node.nodeType == Node.ELEMENT_NODE: + if node.hasAttribute("type") and node.getAttribute("type") == "html": + if len(node.childNodes) == 0: + node.removeAttribute("type") + elif len(node.childNodes) == 1: + # replace html entity defs with utf-8 + chunks = re.split(r"&(\w+);", node.childNodes[0].nodeValue) + for i in range(1, len(chunks), 2): + if chunks[i] in ["amp", "lt", "gt", "apos", "quot"]: + chunks[i] = "&" + chunks[i] + ";" + elif chunks[i] in name2codepoint: + chunks[i] = chr(name2codepoint[chunks[i]]) + else: + chunks[i] = "&" + chunks[i] + ";" + text = "".join(chunks) + + try: + # see if the resulting text is a well-formed XML fragment + div = '
<div xmlns="http://www.w3.org/1999/xhtml">%s</div>
' + data = minidom.parseString(div % text.encode("utf-8")) + + if text.find("<") < 0: + # plain text + node.removeAttribute("type") + text = data.documentElement.childNodes[0].nodeValue + node.childNodes[0].replaceWholeText(text) + + elif len(text) > 80: + # xhtml + node.setAttribute("type", "xhtml") + node.removeChild(node.childNodes[0]) + node.appendChild(data.documentElement) + + except ExpatError: + # leave as html + pass + + else: + # recurse + retype(node) + + if parent.nodeName == "entry": + relativize(parent) + + +if __name__ == "__main__": + # run styler on each file mention on the command line + import sys + + for feed in sys.argv[1:]: + doc = minidom.parse(feed) + doc.normalize() + retype(doc.documentElement) + open(feed, "w").write(doc.toxml("utf-8")) diff --git a/code/planet/cache.py b/code/planet/cache.py index dfc529b7..d6031c28 100644 --- a/code/planet/cache.py +++ b/code/planet/cache.py @@ -1,5 +1,4 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- +#!/usr/bin/env python3 """Item cache. Between runs of Planet we need somewhere to store the feed information @@ -12,13 +11,17 @@ import os import re - +import shelve +import time +from typing import Any, TypeAlias # Regular expressions to sanitise cache filenames -re_url_scheme = re.compile(r'^[^:]*://') -re_slash = re.compile(r'[?/]+') -re_initial_cruft = re.compile(r'^[,.]*') -re_final_cruft = re.compile(r'[,.]*$') +re_url_scheme = re.compile(r"^[^:]*://") +re_slash = re.compile(r"[?/]+") +re_initial_cruft = re.compile(r"^[,.]*") +re_final_cruft = re.compile(r"[,.]*$") + +TimeTuple: TypeAlias = tuple[int, int, int, int, int, int, int, int, int] class CachedInfo: @@ -33,20 +36,21 @@ class CachedInfo: and implement get_FIELD and set_FIELD functions which will be automatically called. """ + STRING = "string" - DATE = "date" - NULL = "null" + DATE = "date" + NULL = "null" - def __init__(self, cache, id_, root=0): - self._type = {} - self._value = {} - self._cached = {} + def __init__(self, cache: shelve.Shelf[Any], id_, root=False): + self._type: dict[str, str] = {} + self._value: dict[str, Any] = {} + self._cached: dict[str, bool] = {} self._cache = cache self._id = id_.replace(" ", "%20") self._root = root - def cache_key(self, key): + def cache_key(self, key: str) -> str: """Return the cache key name for the given key.""" key = key.replace(" ", "_") if self._root: @@ -54,85 +58,74 @@ def cache_key(self, key): else: return self._id + " " + key - def cache_read(self): + def cache_read(self) -> None: """Read information from the cache.""" - if self._root: - keys_key = " keys" - else: - keys_key = self._id + keys_key = " keys" if self._root else self._id - if self._cache.has_key(keys_key): + if keys_key in self._cache: keys = self._cache[keys_key].split(" ") else: return for key in keys: cache_key = self.cache_key(key) - if not self._cached.has_key(key) or self._cached[key]: + if key not in self._cached or self._cached[key]: # Key either hasn't been loaded, or is one for the cache self._value[key] = self._cache[cache_key] - self._type[key] = self._cache[cache_key + " type"] - self._cached[key] = 1 + self._type[key] = self._cache[f"{cache_key} type"] + self._cached[key] = True - def cache_write(self, sync=1): + def cache_write(self, sync: bool = True): """Write information to the cache.""" - self.cache_clear(sync=0) + self.cache_clear(sync=False) keys = [] for key in self.keys(): cache_key = self.cache_key(key) if not self._cached[key]: - if self._cache.has_key(cache_key): + if cache_key in self._cache: # Non-cached keys need to 
be cleared - del(self._cache[cache_key]) - del(self._cache[cache_key + " type"]) + del self._cache[cache_key] + del self._cache[f"{cache_key} type"] continue keys.append(key) self._cache[cache_key] = self._value[key] - self._cache[cache_key + " type"] = self._type[key] - - if self._root: - keys_key = " keys" - else: - keys_key = self._id + self._cache[f"{cache_key} type"] = self._type[key] + keys_key = " keys" if self._root else self._id self._cache[keys_key] = " ".join(keys) if sync: self._cache.sync() - def cache_clear(self, sync=1): + def cache_clear(self, sync: bool = True): """Remove information from the cache.""" - if self._root: - keys_key = " keys" - else: - keys_key = self._id + keys_key = " keys" if self._root else self._id - if self._cache.has_key(keys_key): - keys = self._cache[keys_key].split(" ") - del(self._cache[keys_key]) - else: + if keys_key not in self._cache: return + keys = self._cache[keys_key].split(" ") + del self._cache[keys_key] for key in keys: cache_key = self.cache_key(key) - del(self._cache[cache_key]) - del(self._cache[cache_key + " type"]) + del self._cache[cache_key] + del self._cache[f"{cache_key} type"] if sync: self._cache.sync() - def has_key(self, key): + def has_key(self, key: str) -> bool: """Check whether the key exists.""" key = key.replace(" ", "_") - return self._value.has_key(key) + return key in self._value def key_type(self, key): """Return the key type.""" key = key.replace(" ", "_") return self._type[key] - def set(self, key, value, cached=1): + def set(self, key: str, value: Any, cached: bool = True) -> Any: """Set the value of the given key. If a set_KEY function exists that is called otherwise the @@ -148,15 +141,17 @@ def set(self, key, value, cached=1): else: return func(key, value) - if value == None: + if value is None: return self.set_as_null(key, value) + elif isinstance(value, time.struct_time): + return self.set_as_date(key, value) else: try: - return self.set_as_string(key, value) + return self.set_as_string(key, value, cached) except TypeError: - return self.set_as_date(key, value) + return self.set_as_date(key, value, cached) - def get(self, key): + def get(self, key: str) -> Any | None: """Return the value of the given key. If a get_KEY function exists that is called otherwise the @@ -180,7 +175,7 @@ def get(self, key): return self._value[key] - def set_as_string(self, key, value, cached=1): + def set_as_string(self, key, value, cached: bool = True): """Set the key to the string value. The value is converted to UTF-8 if it is a Unicode string, otherwise @@ -197,33 +192,31 @@ def set_as_string(self, key, value, cached=1): def get_as_string(self, key): """Return the key as a string value.""" key = key.replace(" ", "_") - if not self.has_key(key): - raise KeyError, key - + if key not in self._value: + raise KeyError(key) return self._value[key] - def set_as_date(self, key, value, cached=1): + def set_as_date(self, key, value, cached: bool = True): """Set the key to the date value. The date should be a 9-item tuple as returned by time.gmtime(). 
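set_as_date and get_as_date round-trip the 9-item tuple through a plain space-separated string of integers. Isolated from the class, the encoding looks like this sketch (the ISO output format here is just an illustrative literal):

import time

# Encode a 9-item time tuple the way set_as_date stores it...
stamp = time.gmtime()
encoded = " ".join(str(s) for s in stamp)  # e.g. "2024 5 1 12 0 0 2 122 0"

# ...and decode it the way get_as_date hands it back.
decoded = tuple(int(i) for i in encoded.split(" "))
assert decoded == tuple(stamp)
print(time.strftime("%Y-%m-%dT%H:%M:%S+00:00", decoded))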
""" - value = " ".join([ str(s) for s in value ]) + value = " ".join([str(s) for s in value]) key = key.replace(" ", "_") self._value[key] = value self._type[key] = self.DATE self._cached[key] = cached - def get_as_date(self, key): + def get_as_date(self, key: str) -> TimeTuple | None: """Return the key as a date value.""" key = key.replace(" ", "_") - if not self.has_key(key): - raise KeyError, key - + if key not in self._value: + raise KeyError(key) value = self._value[key] - return tuple([ int(i) for i in value.split(" ") ]) + return tuple(int(i) for i in value.split(" ")) - def set_as_null(self, key, value, cached=1): + def set_as_null(self, key, _value, cached: bool = True): """Set the key to the null value. This only exists to make things less magic. @@ -236,20 +229,18 @@ def set_as_null(self, key, value, cached=1): def get_as_null(self, key): """Return the key as the null value.""" key = key.replace(" ", "_") - if not self.has_key(key): - raise KeyError, key - - return None + if key not in self._value: + raise KeyError(key) def del_key(self, key): """Delete the given key.""" key = key.replace(" ", "_") - if not self.has_key(key): - raise KeyError, key + if key not in self._value: + raise KeyError(key) - del(self._value[key]) - del(self._type[key]) - del(self._cached[key]) + del self._value[key] + del self._type[key] + del self._cached[key] def keys(self): """Return the list of cached keys.""" @@ -261,10 +252,10 @@ def __iter__(self): # Special methods __contains__ = has_key - __setitem__ = set_as_string - __getitem__ = get - __delitem__ = del_key - __delattr__ = del_key + __setitem__ = set_as_string + __getitem__ = get + __delitem__ = del_key + __delattr__ = del_key def __setattr__(self, key, value): if key.startswith("_"): @@ -273,10 +264,9 @@ def __setattr__(self, key, value): self.set(key, value) def __getattr__(self, key): - if self.has_key(key): + if key in self._value: return self.get(key) - else: - raise AttributeError, key + raise AttributeError(key) def filename(directory, filename): @@ -292,15 +282,9 @@ def filename(directory, filename): return os.path.join(directory, filename) + def utf8(value): """Return the value as a UTF-8 string.""" - if type(value) == type(u''): - return value.encode("utf-8") - else: - try: - return unicode(value, "utf-8").encode("utf-8") - except UnicodeError: - try: - return unicode(value, "iso-8859-1").encode("utf-8") - except UnicodeError: - return unicode(value, "ascii", "replace").encode("utf-8") + if isinstance(value, str): + return value + return value.decode("utf-8") if isinstance(value, bytes) else str(value) diff --git a/code/planet/compat_logging/__init__.py b/code/planet/compat_logging/__init__.py deleted file mode 100644 index 3bd0c6d7..00000000 --- a/code/planet/compat_logging/__init__.py +++ /dev/null @@ -1,1196 +0,0 @@ -# Copyright 2001-2002 by Vinay Sajip. All Rights Reserved. -# -# Permission to use, copy, modify, and distribute this software and its -# documentation for any purpose and without fee is hereby granted, -# provided that the above copyright notice appear in all copies and that -# both that copyright notice and this permission notice appear in -# supporting documentation, and that the name of Vinay Sajip -# not be used in advertising or publicity pertaining to distribution -# of the software without specific, written prior permission. -# VINAY SAJIP DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING -# ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL -# VINAY SAJIP BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR -# ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER -# IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT -# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -""" -Logging package for Python. Based on PEP 282 and comments thereto in -comp.lang.python, and influenced by Apache's log4j system. - -Should work under Python versions >= 1.5.2, except that source line -information is not available unless 'sys._getframe()' is. - -Copyright (C) 2001-2002 Vinay Sajip. All Rights Reserved. - -To use, simply 'import logging' and log away! -""" - -import sys, os, types, time, string, cStringIO - -try: - import thread - import threading -except ImportError: - thread = None - -__author__ = "Vinay Sajip " -__status__ = "beta" -__version__ = "0.4.8.1" -__date__ = "26 June 2003" - -#--------------------------------------------------------------------------- -# Miscellaneous module data -#--------------------------------------------------------------------------- - -# -#_srcfile is used when walking the stack to check when we've got the first -# caller stack frame. -# -if string.lower(__file__[-4:]) in ['.pyc', '.pyo']: - _srcfile = __file__[:-4] + '.py' -else: - _srcfile = __file__ -_srcfile = os.path.normcase(_srcfile) - -# _srcfile is only used in conjunction with sys._getframe(). -# To provide compatibility with older versions of Python, set _srcfile -# to None if _getframe() is not available; this value will prevent -# findCaller() from being called. -if not hasattr(sys, "_getframe"): - _srcfile = None - -# -#_startTime is used as the base when calculating the relative time of events -# -_startTime = time.time() - -# -#raiseExceptions is used to see if exceptions during handling should be -#propagated -# -raiseExceptions = 1 - -#--------------------------------------------------------------------------- -# Level related stuff -#--------------------------------------------------------------------------- -# -# Default levels and level names, these can be replaced with any positive set -# of values having corresponding names. There is a pseudo-level, NOTSET, which -# is only really there as a lower limit for user-defined levels. Handlers and -# loggers are initialized with NOTSET so that they will log all messages, even -# at user-defined levels. -# -CRITICAL = 50 -FATAL = CRITICAL -ERROR = 40 -WARNING = 30 -WARN = WARNING -INFO = 20 -DEBUG = 10 -NOTSET = 0 - -_levelNames = { - CRITICAL : 'CRITICAL', - ERROR : 'ERROR', - WARNING : 'WARNING', - INFO : 'INFO', - DEBUG : 'DEBUG', - NOTSET : 'NOTSET', - 'CRITICAL' : CRITICAL, - 'ERROR' : ERROR, - 'WARN' : WARNING, - 'WARNING' : WARNING, - 'INFO' : INFO, - 'DEBUG' : DEBUG, - 'NOTSET' : NOTSET, -} - -def getLevelName(level): - """ - Return the textual representation of logging level 'level'. - - If the level is one of the predefined levels (CRITICAL, ERROR, WARNING, - INFO, DEBUG) then you get the corresponding string. If you have - associated levels with names using addLevelName then the name you have - associated with 'level' is returned. Otherwise, the string - "Level %s" % level is returned. - """ - return _levelNames.get(level, ("Level %s" % level)) - -def addLevelName(level, levelName): - """ - Associate 'levelName' with 'level'. - - This is used when converting levels to text during message formatting. - """ - _acquireLock() - try: #unlikely to cause an exception, but you never know... 
- _levelNames[level] = levelName - _levelNames[levelName] = level - finally: - _releaseLock() - -#--------------------------------------------------------------------------- -# Thread-related stuff -#--------------------------------------------------------------------------- - -# -#_lock is used to serialize access to shared data structures in this module. -#This needs to be an RLock because fileConfig() creates Handlers and so -#might arbitrary user threads. Since Handler.__init__() updates the shared -#dictionary _handlers, it needs to acquire the lock. But if configuring, -#the lock would already have been acquired - so we need an RLock. -#The same argument applies to Loggers and Manager.loggerDict. -# -_lock = None - -def _acquireLock(): - """ - Acquire the module-level lock for serializing access to shared data. - - This should be released with _releaseLock(). - """ - global _lock - if (not _lock) and thread: - _lock = threading.RLock() - if _lock: - _lock.acquire() - -def _releaseLock(): - """ - Release the module-level lock acquired by calling _acquireLock(). - """ - if _lock: - _lock.release() - -#--------------------------------------------------------------------------- -# The logging record -#--------------------------------------------------------------------------- - -class LogRecord: - """ - A LogRecord instance represents an event being logged. - - LogRecord instances are created every time something is logged. They - contain all the information pertinent to the event being logged. The - main information passed in is in msg and args, which are combined - using str(msg) % args to create the message field of the record. The - record also includes information such as when the record was created, - the source line where the logging call was made, and any exception - information to be logged. - """ - def __init__(self, name, level, pathname, lineno, msg, args, exc_info): - """ - Initialize a logging record with interesting information. - """ - ct = time.time() - self.name = name - self.msg = msg - self.args = args - self.levelname = getLevelName(level) - self.levelno = level - self.pathname = pathname - try: - self.filename = os.path.basename(pathname) - self.module = os.path.splitext(self.filename)[0] - except: - self.filename = pathname - self.module = "Unknown module" - self.exc_info = exc_info - self.lineno = lineno - self.created = ct - self.msecs = (ct - long(ct)) * 1000 - self.relativeCreated = (self.created - _startTime) * 1000 - if thread: - self.thread = thread.get_ident() - else: - self.thread = None - if hasattr(os, 'getpid'): - self.process = os.getpid() - else: - self.process = None - - def __str__(self): - return ''%(self.name, self.levelno, - self.pathname, self.lineno, self.msg) - - def getMessage(self): - """ - Return the message for this LogRecord. - - Return the message for this LogRecord after merging any user-supplied - arguments with the message. - """ - if not hasattr(types, "UnicodeType"): #if no unicode support... - msg = str(self.msg) - else: - try: - msg = str(self.msg) - except UnicodeError: - msg = self.msg #Defer encoding till later - if self.args: - msg = msg % self.args - return msg - -def makeLogRecord(dict): - """ - Make a LogRecord whose attributes are defined by the specified dictionary, - This function is useful for converting a logging event received over - a socket connection (which is sent as a dictionary) into a LogRecord - instance. 
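makeLogRecord survives unchanged in the standard library's logging package, so the behaviour of this deleted copy can still be exercised with the modern equivalent:

import logging

# A dict such as a SocketHandler peer would send (a pickled record.__dict__);
# the field values here are made up for the example.
event = {"name": "planet.feed", "levelno": logging.WARNING,
         "levelname": "WARNING", "msg": "feed %s timed out",
         "args": ("http://example.org/rss",)}

record = logging.makeLogRecord(event)
print(record.getMessage())  # -> feed http://example.org/rss timed out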
- """ - rv = LogRecord(None, None, "", 0, "", (), None) - rv.__dict__.update(dict) - return rv - -#--------------------------------------------------------------------------- -# Formatter classes and functions -#--------------------------------------------------------------------------- - -class Formatter: - """ - Formatter instances are used to convert a LogRecord to text. - - Formatters need to know how a LogRecord is constructed. They are - responsible for converting a LogRecord to (usually) a string which can - be interpreted by either a human or an external system. The base Formatter - allows a formatting string to be specified. If none is supplied, the - default value of "%s(message)\\n" is used. - - The Formatter can be initialized with a format string which makes use of - knowledge of the LogRecord attributes - e.g. the default value mentioned - above makes use of the fact that the user's message and arguments are pre- - formatted into a LogRecord's message attribute. Currently, the useful - attributes in a LogRecord are described by: - - %(name)s Name of the logger (logging channel) - %(levelno)s Numeric logging level for the message (DEBUG, INFO, - WARNING, ERROR, CRITICAL) - %(levelname)s Text logging level for the message ("DEBUG", "INFO", - "WARNING", "ERROR", "CRITICAL") - %(pathname)s Full pathname of the source file where the logging - call was issued (if available) - %(filename)s Filename portion of pathname - %(module)s Module (name portion of filename) - %(lineno)d Source line number where the logging call was issued - (if available) - %(created)f Time when the LogRecord was created (time.time() - return value) - %(asctime)s Textual time when the LogRecord was created - %(msecs)d Millisecond portion of the creation time - %(relativeCreated)d Time in milliseconds when the LogRecord was created, - relative to the time the logging module was loaded - (typically at application startup time) - %(thread)d Thread ID (if available) - %(process)d Process ID (if available) - %(message)s The result of record.getMessage(), computed just as - the record is emitted - """ - - converter = time.localtime - - def __init__(self, fmt=None, datefmt=None): - """ - Initialize the formatter with specified format strings. - - Initialize the formatter either with the specified format string, or a - default as described above. Allow for specialized date formatting with - the optional datefmt argument (if omitted, you get the ISO8601 format). - """ - if fmt: - self._fmt = fmt - else: - self._fmt = "%(message)s" - self.datefmt = datefmt - - def formatTime(self, record, datefmt=None): - """ - Return the creation time of the specified LogRecord as formatted text. - - This method should be called from format() by a formatter which - wants to make use of a formatted time. This method can be overridden - in formatters to provide for any specific requirement, but the - basic behaviour is as follows: if datefmt (a string) is specified, - it is used with time.strftime() to format the creation time of the - record. Otherwise, the ISO8601 format is used. The resulting - string is returned. This function uses a user-configurable function - to convert the creation time to a tuple. By default, time.localtime() - is used; to change this for a particular formatter instance, set the - 'converter' attribute to a function with the same signature as - time.localtime() or time.gmtime(). 
To change it for all formatters, - for example if you want all logging times to be shown in GMT, - set the 'converter' attribute in the Formatter class. - """ - ct = self.converter(record.created) - if datefmt: - s = time.strftime(datefmt, ct) - else: - t = time.strftime("%Y-%m-%d %H:%M:%S", ct) - s = "%s,%03d" % (t, record.msecs) - return s - - def formatException(self, ei): - """ - Format and return the specified exception information as a string. - - This default implementation just uses - traceback.print_exception() - """ - import traceback - sio = cStringIO.StringIO() - traceback.print_exception(ei[0], ei[1], ei[2], None, sio) - s = sio.getvalue() - sio.close() - if s[-1] == "\n": - s = s[:-1] - return s - - def format(self, record): - """ - Format the specified record as text. - - The record's attribute dictionary is used as the operand to a - string formatting operation which yields the returned string. - Before formatting the dictionary, a couple of preparatory steps - are carried out. The message attribute of the record is computed - using LogRecord.getMessage(). If the formatting string contains - "%(asctime)", formatTime() is called to format the event time. - If there is exception information, it is formatted using - formatException() and appended to the message. - """ - record.message = record.getMessage() - if string.find(self._fmt,"%(asctime)") >= 0: - record.asctime = self.formatTime(record, self.datefmt) - s = self._fmt % record.__dict__ - if record.exc_info: - if s[-1] != "\n": - s = s + "\n" - s = s + self.formatException(record.exc_info) - return s - -# -# The default formatter to use when no other is specified -# -_defaultFormatter = Formatter() - -class BufferingFormatter: - """ - A formatter suitable for formatting a number of records. - """ - def __init__(self, linefmt=None): - """ - Optionally specify a formatter which will be used to format each - individual record. - """ - if linefmt: - self.linefmt = linefmt - else: - self.linefmt = _defaultFormatter - - def formatHeader(self, records): - """ - Return the header string for the specified records. - """ - return "" - - def formatFooter(self, records): - """ - Return the footer string for the specified records. - """ - return "" - - def format(self, records): - """ - Format the specified records and return the result as a string. - """ - rv = "" - if len(records) > 0: - rv = rv + self.formatHeader(records) - for record in records: - rv = rv + self.linefmt.format(record) - rv = rv + self.formatFooter(records) - return rv - -#--------------------------------------------------------------------------- -# Filter classes and functions -#--------------------------------------------------------------------------- - -class Filter: - """ - Filter instances are used to perform arbitrary filtering of LogRecords. - - Loggers and Handlers can optionally use Filter instances to filter - records as desired. The base filter class only allows events which are - below a certain point in the logger hierarchy. For example, a filter - initialized with "A.B" will allow events logged by loggers "A.B", - "A.B.C", "A.B.C.D", "A.B.D" etc. but not "A.BB", "B.A.B" etc. If - initialized with the empty string, all events are passed. - """ - def __init__(self, name=''): - """ - Initialize a filter. - - Initialize with the name of the logger which, together with its - children, will have its events allowed through the filter. If no - name is specified, allow every event. 
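The prefix rule this deleted Filter describes ("A.B" passes "A.B.C" but not "A.BB") is the same one the standard-library logging.Filter still implements, which makes it easy to demonstrate:

import logging

f = logging.Filter("A.B")

def passes(name):
    return f.filter(logging.makeLogRecord({"name": name}))

print(passes("A.B"))    # True  - the channel itself
print(passes("A.B.C"))  # True  - a child channel
print(passes("A.BB"))   # False - a plain string prefix is not enough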
- """ - self.name = name - self.nlen = len(name) - - def filter(self, record): - """ - Determine if the specified record is to be logged. - - Is the specified record to be logged? Returns 0 for no, nonzero for - yes. If deemed appropriate, the record may be modified in-place. - """ - if self.nlen == 0: - return 1 - elif self.name == record.name: - return 1 - elif string.find(record.name, self.name, 0, self.nlen) != 0: - return 0 - return (record.name[self.nlen] == ".") - -class Filterer: - """ - A base class for loggers and handlers which allows them to share - common code. - """ - def __init__(self): - """ - Initialize the list of filters to be an empty list. - """ - self.filters = [] - - def addFilter(self, filter): - """ - Add the specified filter to this handler. - """ - if not (filter in self.filters): - self.filters.append(filter) - - def removeFilter(self, filter): - """ - Remove the specified filter from this handler. - """ - if filter in self.filters: - self.filters.remove(filter) - - def filter(self, record): - """ - Determine if a record is loggable by consulting all the filters. - - The default is to allow the record to be logged; any filter can veto - this and the record is then dropped. Returns a zero value if a record - is to be dropped, else non-zero. - """ - rv = 1 - for f in self.filters: - if not f.filter(record): - rv = 0 - break - return rv - -#--------------------------------------------------------------------------- -# Handler classes and functions -#--------------------------------------------------------------------------- - -_handlers = {} #repository of handlers (for flushing when shutdown called) - -class Handler(Filterer): - """ - Handler instances dispatch logging events to specific destinations. - - The base handler class. Acts as a placeholder which defines the Handler - interface. Handlers can optionally use Formatter instances to format - records as desired. By default, no formatter is specified; in this case, - the 'raw' message as determined by record.message is logged. - """ - def __init__(self, level=NOTSET): - """ - Initializes the instance - basically setting the formatter to None - and the filter list to empty. - """ - Filterer.__init__(self) - self.level = level - self.formatter = None - #get the module data lock, as we're updating a shared structure. - _acquireLock() - try: #unlikely to raise an exception, but you never know... - _handlers[self] = 1 - finally: - _releaseLock() - self.createLock() - - def createLock(self): - """ - Acquire a thread lock for serializing access to the underlying I/O. - """ - if thread: - self.lock = thread.allocate_lock() - else: - self.lock = None - - def acquire(self): - """ - Acquire the I/O thread lock. - """ - if self.lock: - self.lock.acquire() - - def release(self): - """ - Release the I/O thread lock. - """ - if self.lock: - self.lock.release() - - def setLevel(self, level): - """ - Set the logging level of this handler. - """ - self.level = level - - def format(self, record): - """ - Format the specified record. - - If a formatter is set, use it. Otherwise, use the default formatter - for the module. - """ - if self.formatter: - fmt = self.formatter - else: - fmt = _defaultFormatter - return fmt.format(record) - - def emit(self, record): - """ - Do whatever it takes to actually log the specified logging record. - - This version is intended to be implemented by subclasses and so - raises a NotImplementedError. 
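The emit() contract described here is unchanged in modern logging: subclasses override emit(), and the base class supplies levels, locking, and formatting. A minimal sketch against the current standard library:

import logging

class ListHandler(logging.Handler):
    """Collect formatted records in a list - the smallest useful emit()."""
    def __init__(self):
        super().__init__()
        self.records = []

    def emit(self, record):
        self.records.append(self.format(record))

log = logging.getLogger("demo")
log.propagate = False          # keep the example's output self-contained
handler = ListHandler()
log.addHandler(handler)
log.warning("hello")
print(handler.records)         # -> ['hello']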
- """ - raise NotImplementedError, 'emit must be implemented '\ - 'by Handler subclasses' - - def handle(self, record): - """ - Conditionally emit the specified logging record. - - Emission depends on filters which may have been added to the handler. - Wrap the actual emission of the record with acquisition/release of - the I/O thread lock. Returns whether the filter passed the record for - emission. - """ - rv = self.filter(record) - if rv: - self.acquire() - try: - self.emit(record) - finally: - self.release() - return rv - - def setFormatter(self, fmt): - """ - Set the formatter for this handler. - """ - self.formatter = fmt - - def flush(self): - """ - Ensure all logging output has been flushed. - - This version does nothing and is intended to be implemented by - subclasses. - """ - pass - - def close(self): - """ - Tidy up any resources used by the handler. - - This version does nothing and is intended to be implemented by - subclasses. - """ - pass - - def handleError(self, record): - """ - Handle errors which occur during an emit() call. - - This method should be called from handlers when an exception is - encountered during an emit() call. If raiseExceptions is false, - exceptions get silently ignored. This is what is mostly wanted - for a logging system - most users will not care about errors in - the logging system, they are more interested in application errors. - You could, however, replace this with a custom handler if you wish. - The record which was being processed is passed in to this method. - """ - if raiseExceptions: - import traceback - ei = sys.exc_info() - traceback.print_exception(ei[0], ei[1], ei[2], None, sys.stderr) - del ei - -class StreamHandler(Handler): - """ - A handler class which writes logging records, appropriately formatted, - to a stream. Note that this class does not close the stream, as - sys.stdout or sys.stderr may be used. - """ - def __init__(self, strm=None): - """ - Initialize the handler. - - If strm is not specified, sys.stderr is used. - """ - Handler.__init__(self) - if not strm: - strm = sys.stderr - self.stream = strm - self.formatter = None - - def flush(self): - """ - Flushes the stream. - """ - self.stream.flush() - - def emit(self, record): - """ - Emit a record. - - If a formatter is specified, it is used to format the record. - The record is then written to the stream with a trailing newline - [N.B. this may be removed depending on feedback]. If exception - information is present, it is formatted using - traceback.print_exception and appended to the stream. - """ - try: - msg = self.format(record) - if not hasattr(types, "UnicodeType"): #if no unicode support... - self.stream.write("%s\n" % msg) - else: - try: - self.stream.write("%s\n" % msg) - except UnicodeError: - self.stream.write("%s\n" % msg.encode("UTF-8")) - self.flush() - except: - self.handleError(record) - -class FileHandler(StreamHandler): - """ - A handler class which writes formatted logging records to disk files. - """ - def __init__(self, filename, mode="a"): - """ - Open the specified file and use it as the stream for logging. - """ - StreamHandler.__init__(self, open(filename, mode)) - self.baseFilename = filename - self.mode = mode - - def close(self): - """ - Closes the stream. 
- """ - self.stream.close() - -#--------------------------------------------------------------------------- -# Manager classes and functions -#--------------------------------------------------------------------------- - -class PlaceHolder: - """ - PlaceHolder instances are used in the Manager logger hierarchy to take - the place of nodes for which no loggers have been defined [FIXME add - example]. - """ - def __init__(self, alogger): - """ - Initialize with the specified logger being a child of this placeholder. - """ - self.loggers = [alogger] - - def append(self, alogger): - """ - Add the specified logger as a child of this placeholder. - """ - if alogger not in self.loggers: - self.loggers.append(alogger) - -# -# Determine which class to use when instantiating loggers. -# -_loggerClass = None - -def setLoggerClass(klass): - """ - Set the class to be used when instantiating a logger. The class should - define __init__() such that only a name argument is required, and the - __init__() should call Logger.__init__() - """ - if klass != Logger: - if not issubclass(klass, Logger): - raise TypeError, "logger not derived from logging.Logger: " + \ - klass.__name__ - global _loggerClass - _loggerClass = klass - -class Manager: - """ - There is [under normal circumstances] just one Manager instance, which - holds the hierarchy of loggers. - """ - def __init__(self, rootnode): - """ - Initialize the manager with the root node of the logger hierarchy. - """ - self.root = rootnode - self.disable = 0 - self.emittedNoHandlerWarning = 0 - self.loggerDict = {} - - def getLogger(self, name): - """ - Get a logger with the specified name (channel name), creating it - if it doesn't yet exist. - - If a PlaceHolder existed for the specified name [i.e. the logger - didn't exist but a child of it did], replace it with the created - logger and fix up the parent/child references which pointed to the - placeholder to now point to the logger. - """ - rv = None - _acquireLock() - try: - if self.loggerDict.has_key(name): - rv = self.loggerDict[name] - if isinstance(rv, PlaceHolder): - ph = rv - rv = _loggerClass(name) - rv.manager = self - self.loggerDict[name] = rv - self._fixupChildren(ph, rv) - self._fixupParents(rv) - else: - rv = _loggerClass(name) - rv.manager = self - self.loggerDict[name] = rv - self._fixupParents(rv) - finally: - _releaseLock() - return rv - - def _fixupParents(self, alogger): - """ - Ensure that there are either loggers or placeholders all the way - from the specified logger to the root of the logger hierarchy. - """ - name = alogger.name - i = string.rfind(name, ".") - rv = None - while (i > 0) and not rv: - substr = name[:i] - if not self.loggerDict.has_key(substr): - self.loggerDict[substr] = PlaceHolder(alogger) - else: - obj = self.loggerDict[substr] - if isinstance(obj, Logger): - rv = obj - else: - assert isinstance(obj, PlaceHolder) - obj.append(alogger) - i = string.rfind(name, ".", 0, i - 1) - if not rv: - rv = self.root - alogger.parent = rv - - def _fixupChildren(self, ph, alogger): - """ - Ensure that children of the placeholder ph are connected to the - specified logger. 
- """ - for c in ph.loggers: - if string.find(c.parent.name, alogger.name) <> 0: - alogger.parent = c.parent - c.parent = alogger - -#--------------------------------------------------------------------------- -# Logger classes and functions -#--------------------------------------------------------------------------- - -class Logger(Filterer): - """ - Instances of the Logger class represent a single logging channel. A - "logging channel" indicates an area of an application. Exactly how an - "area" is defined is up to the application developer. Since an - application can have any number of areas, logging channels are identified - by a unique string. Application areas can be nested (e.g. an area - of "input processing" might include sub-areas "read CSV files", "read - XLS files" and "read Gnumeric files"). To cater for this natural nesting, - channel names are organized into a namespace hierarchy where levels are - separated by periods, much like the Java or Python package namespace. So - in the instance given above, channel names might be "input" for the upper - level, and "input.csv", "input.xls" and "input.gnu" for the sub-levels. - There is no arbitrary limit to the depth of nesting. - """ - def __init__(self, name, level=NOTSET): - """ - Initialize the logger with a name and an optional level. - """ - Filterer.__init__(self) - self.name = name - self.level = level - self.parent = None - self.propagate = 1 - self.handlers = [] - self.disabled = 0 - - def setLevel(self, level): - """ - Set the logging level of this logger. - """ - self.level = level - -# def getRoot(self): -# """ -# Get the root of the logger hierarchy. -# """ -# return Logger.root - - def debug(self, msg, *args, **kwargs): - """ - Log 'msg % args' with severity 'DEBUG'. - - To pass exception information, use the keyword argument exc_info with - a true value, e.g. - - logger.debug("Houston, we have a %s", "thorny problem", exc_info=1) - """ - if self.manager.disable >= DEBUG: - return - if DEBUG >= self.getEffectiveLevel(): - apply(self._log, (DEBUG, msg, args), kwargs) - - def info(self, msg, *args, **kwargs): - """ - Log 'msg % args' with severity 'INFO'. - - To pass exception information, use the keyword argument exc_info with - a true value, e.g. - - logger.info("Houston, we have a %s", "interesting problem", exc_info=1) - """ - if self.manager.disable >= INFO: - return - if INFO >= self.getEffectiveLevel(): - apply(self._log, (INFO, msg, args), kwargs) - - def warning(self, msg, *args, **kwargs): - """ - Log 'msg % args' with severity 'WARNING'. - - To pass exception information, use the keyword argument exc_info with - a true value, e.g. - - logger.warning("Houston, we have a %s", "bit of a problem", exc_info=1) - """ - if self.manager.disable >= WARNING: - return - if self.isEnabledFor(WARNING): - apply(self._log, (WARNING, msg, args), kwargs) - - warn = warning - - def error(self, msg, *args, **kwargs): - """ - Log 'msg % args' with severity 'ERROR'. - - To pass exception information, use the keyword argument exc_info with - a true value, e.g. - - logger.error("Houston, we have a %s", "major problem", exc_info=1) - """ - if self.manager.disable >= ERROR: - return - if self.isEnabledFor(ERROR): - apply(self._log, (ERROR, msg, args), kwargs) - - def exception(self, msg, *args): - """ - Convenience method for logging an ERROR with exception information. - """ - apply(self.error, (msg,) + args, {'exc_info': 1}) - - def critical(self, msg, *args, **kwargs): - """ - Log 'msg % args' with severity 'CRITICAL'. 
- - To pass exception information, use the keyword argument exc_info with - a true value, e.g. - - logger.critical("Houston, we have a %s", "major disaster", exc_info=1) - """ - if self.manager.disable >= CRITICAL: - return - if CRITICAL >= self.getEffectiveLevel(): - apply(self._log, (CRITICAL, msg, args), kwargs) - - fatal = critical - - def log(self, level, msg, *args, **kwargs): - """ - Log 'msg % args' with the severity 'level'. - - To pass exception information, use the keyword argument exc_info with - a true value, e.g. - - logger.log(level, "We have a %s", "mysterious problem", exc_info=1) - """ - if self.manager.disable >= level: - return - if self.isEnabledFor(level): - apply(self._log, (level, msg, args), kwargs) - - def findCaller(self): - """ - Find the stack frame of the caller so that we can note the source - file name and line number. - """ - f = sys._getframe(1) - while 1: - co = f.f_code - filename = os.path.normcase(co.co_filename) - if filename == _srcfile: - f = f.f_back - continue - return filename, f.f_lineno - - def makeRecord(self, name, level, fn, lno, msg, args, exc_info): - """ - A factory method which can be overridden in subclasses to create - specialized LogRecords. - """ - return LogRecord(name, level, fn, lno, msg, args, exc_info) - - def _log(self, level, msg, args, exc_info=None): - """ - Low-level logging routine which creates a LogRecord and then calls - all the handlers of this logger to handle the record. - """ - if _srcfile: - fn, lno = self.findCaller() - else: - fn, lno = "", 0 - if exc_info: - exc_info = sys.exc_info() - record = self.makeRecord(self.name, level, fn, lno, msg, args, exc_info) - self.handle(record) - - def handle(self, record): - """ - Call the handlers for the specified record. - - This method is used for unpickled records received from a socket, as - well as those created locally. Logger-level filtering is applied. - """ - if (not self.disabled) and self.filter(record): - self.callHandlers(record) - - def addHandler(self, hdlr): - """ - Add the specified handler to this logger. - """ - if not (hdlr in self.handlers): - self.handlers.append(hdlr) - - def removeHandler(self, hdlr): - """ - Remove the specified handler from this logger. - """ - if hdlr in self.handlers: - #hdlr.close() - self.handlers.remove(hdlr) - - def callHandlers(self, record): - """ - Pass a record to all relevant handlers. - - Loop through all handlers for this logger and its parents in the - logger hierarchy. If no handler was found, output a one-off error - message to sys.stderr. Stop searching up the hierarchy whenever a - logger with the "propagate" attribute set to zero is found - that - will be the last logger whose handlers are called. - """ - c = self - found = 0 - while c: - for hdlr in c.handlers: - found = found + 1 - if record.levelno >= hdlr.level: - hdlr.handle(record) - if not c.propagate: - c = None #break out - else: - c = c.parent - if (found == 0) and not self.manager.emittedNoHandlerWarning: - sys.stderr.write("No handlers could be found for logger" - " \"%s\"\n" % self.name) - self.manager.emittedNoHandlerWarning = 1 - - def getEffectiveLevel(self): - """ - Get the effective level for this logger. - - Loop through this logger and its parents in the logger hierarchy, - looking for a non-zero logging level. Return the first one found. - """ - logger = self - while logger: - if logger.level: - return logger.level - logger = logger.parent - return NOTSET - - def isEnabledFor(self, level): - """ - Is this logger enabled for level 'level'? 
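getEffectiveLevel() and isEnabledFor() behave the same way in today's logging module: a logger whose level is NOTSET walks up its parents until a non-zero level is found. For example:

import logging

parent = logging.getLogger("planet")
child = logging.getLogger("planet.feed")   # level stays NOTSET

parent.setLevel(logging.ERROR)
print(child.getEffectiveLevel() == logging.ERROR)  # True - taken from the parent
print(child.isEnabledFor(logging.WARNING))         # False - WARNING < ERROR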
- """ - if self.manager.disable >= level: - return 0 - return level >= self.getEffectiveLevel() - -class RootLogger(Logger): - """ - A root logger is not that different to any other logger, except that - it must have a logging level and there is only one instance of it in - the hierarchy. - """ - def __init__(self, level): - """ - Initialize the logger with the name "root". - """ - Logger.__init__(self, "root", level) - -_loggerClass = Logger - -root = RootLogger(WARNING) -Logger.root = root -Logger.manager = Manager(Logger.root) - -#--------------------------------------------------------------------------- -# Configuration classes and functions -#--------------------------------------------------------------------------- - -BASIC_FORMAT = "%(levelname)s:%(name)s:%(message)s" - -def basicConfig(): - """ - Do basic configuration for the logging system by creating a - StreamHandler with a default Formatter and adding it to the - root logger. - """ - if len(root.handlers) == 0: - hdlr = StreamHandler() - fmt = Formatter(BASIC_FORMAT) - hdlr.setFormatter(fmt) - root.addHandler(hdlr) - -#--------------------------------------------------------------------------- -# Utility functions at module level. -# Basically delegate everything to the root logger. -#--------------------------------------------------------------------------- - -def getLogger(name=None): - """ - Return a logger with the specified name, creating it if necessary. - - If no name is specified, return the root logger. - """ - if name: - return Logger.manager.getLogger(name) - else: - return root - -#def getRootLogger(): -# """ -# Return the root logger. -# -# Note that getLogger('') now does the same thing, so this function is -# deprecated and may disappear in the future. -# """ -# return root - -def critical(msg, *args, **kwargs): - """ - Log a message with severity 'CRITICAL' on the root logger. - """ - if len(root.handlers) == 0: - basicConfig() - apply(root.critical, (msg,)+args, kwargs) - -fatal = critical - -def error(msg, *args, **kwargs): - """ - Log a message with severity 'ERROR' on the root logger. - """ - if len(root.handlers) == 0: - basicConfig() - apply(root.error, (msg,)+args, kwargs) - -def exception(msg, *args): - """ - Log a message with severity 'ERROR' on the root logger, - with exception information. - """ - apply(error, (msg,)+args, {'exc_info': 1}) - -def warning(msg, *args, **kwargs): - """ - Log a message with severity 'WARNING' on the root logger. - """ - if len(root.handlers) == 0: - basicConfig() - apply(root.warning, (msg,)+args, kwargs) - -warn = warning - -def info(msg, *args, **kwargs): - """ - Log a message with severity 'INFO' on the root logger. - """ - if len(root.handlers) == 0: - basicConfig() - apply(root.info, (msg,)+args, kwargs) - -def debug(msg, *args, **kwargs): - """ - Log a message with severity 'DEBUG' on the root logger. - """ - if len(root.handlers) == 0: - basicConfig() - apply(root.debug, (msg,)+args, kwargs) - -def disable(level): - """ - Disable all logging calls less severe than 'level'. - """ - root.manager.disable = level - -def shutdown(): - """ - Perform any cleanup actions in the logging system (e.g. flushing - buffers). - - Should be called at application exit. - """ - for h in _handlers.keys(): - h.flush() - h.close() diff --git a/code/planet/compat_logging/config.py b/code/planet/compat_logging/config.py deleted file mode 100644 index d4d08f01..00000000 --- a/code/planet/compat_logging/config.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright 2001-2002 by Vinay Sajip. 
All Rights Reserved. -# -# Permission to use, copy, modify, and distribute this software and its -# documentation for any purpose and without fee is hereby granted, -# provided that the above copyright notice appear in all copies and that -# both that copyright notice and this permission notice appear in -# supporting documentation, and that the name of Vinay Sajip -# not be used in advertising or publicity pertaining to distribution -# of the software without specific, written prior permission. -# VINAY SAJIP DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING -# ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL -# VINAY SAJIP BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR -# ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER -# IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT -# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -""" -Logging package for Python. Based on PEP 282 and comments thereto in -comp.lang.python, and influenced by Apache's log4j system. - -Should work under Python versions >= 1.5.2, except that source line -information is not available unless 'inspect' is. - -Copyright (C) 2001-2002 Vinay Sajip. All Rights Reserved. - -To use, simply 'import logging' and log away! -""" - -import sys, logging, logging.handlers, string, thread, threading, socket, struct, os - -from SocketServer import ThreadingTCPServer, StreamRequestHandler - - -DEFAULT_LOGGING_CONFIG_PORT = 9030 -if sys.platform == "win32": - RESET_ERROR = 10054 #WSAECONNRESET -else: - RESET_ERROR = 104 #ECONNRESET - -# -# The following code implements a socket listener for on-the-fly -# reconfiguration of logging. -# -# _listener holds the server object doing the listening -_listener = None - -def fileConfig(fname, defaults=None): - """ - Read the logging configuration from a ConfigParser-format file. - - This can be called several times from an application, allowing an end user - the ability to select from various pre-canned configurations (if the - developer provides a mechanism to present the choices and load the chosen - configuration). - In versions of ConfigParser which have the readfp method [typically - shipped in 2.x versions of Python], you can pass in a file-like object - rather than a filename, in which case the file-like object will be read - using readfp. - """ - import ConfigParser - - cp = ConfigParser.ConfigParser(defaults) - if hasattr(cp, 'readfp') and hasattr(fname, 'readline'): - cp.readfp(fname) - else: - cp.read(fname) - #first, do the formatters... - flist = cp.get("formatters", "keys") - if len(flist): - flist = string.split(flist, ",") - formatters = {} - for form in flist: - sectname = "formatter_%s" % form - opts = cp.options(sectname) - if "format" in opts: - fs = cp.get(sectname, "format", 1) - else: - fs = None - if "datefmt" in opts: - dfs = cp.get(sectname, "datefmt", 1) - else: - dfs = None - f = logging.Formatter(fs, dfs) - formatters[form] = f - #next, do the handlers... - #critical section... - logging._acquireLock() - try: - try: - #first, lose the existing handlers... - logging._handlers.clear() - #now set up the new ones... 
- hlist = cp.get("handlers", "keys") - if len(hlist): - hlist = string.split(hlist, ",") - handlers = {} - fixups = [] #for inter-handler references - for hand in hlist: - sectname = "handler_%s" % hand - klass = cp.get(sectname, "class") - opts = cp.options(sectname) - if "formatter" in opts: - fmt = cp.get(sectname, "formatter") - else: - fmt = "" - klass = eval(klass, vars(logging)) - args = cp.get(sectname, "args") - args = eval(args, vars(logging)) - h = apply(klass, args) - if "level" in opts: - level = cp.get(sectname, "level") - h.setLevel(logging._levelNames[level]) - if len(fmt): - h.setFormatter(formatters[fmt]) - #temporary hack for FileHandler and MemoryHandler. - if klass == logging.handlers.MemoryHandler: - if "target" in opts: - target = cp.get(sectname,"target") - else: - target = "" - if len(target): #the target handler may not be loaded yet, so keep for later... - fixups.append((h, target)) - handlers[hand] = h - #now all handlers are loaded, fixup inter-handler references... - for fixup in fixups: - h = fixup[0] - t = fixup[1] - h.setTarget(handlers[t]) - #at last, the loggers...first the root... - llist = cp.get("loggers", "keys") - llist = string.split(llist, ",") - llist.remove("root") - sectname = "logger_root" - root = logging.root - log = root - opts = cp.options(sectname) - if "level" in opts: - level = cp.get(sectname, "level") - log.setLevel(logging._levelNames[level]) - for h in root.handlers[:]: - root.removeHandler(h) - hlist = cp.get(sectname, "handlers") - if len(hlist): - hlist = string.split(hlist, ",") - for hand in hlist: - log.addHandler(handlers[hand]) - #and now the others... - #we don't want to lose the existing loggers, - #since other threads may have pointers to them. - #existing is set to contain all existing loggers, - #and as we go through the new configuration we - #remove any which are configured. At the end, - #what's left in existing is the set of loggers - #which were in the previous configuration but - #which are not in the new configuration. - existing = root.manager.loggerDict.keys() - #now set up the new ones... - for log in llist: - sectname = "logger_%s" % log - qn = cp.get(sectname, "qualname") - opts = cp.options(sectname) - if "propagate" in opts: - propagate = cp.getint(sectname, "propagate") - else: - propagate = 1 - logger = logging.getLogger(qn) - if qn in existing: - existing.remove(qn) - if "level" in opts: - level = cp.get(sectname, "level") - logger.setLevel(logging._levelNames[level]) - for h in logger.handlers[:]: - logger.removeHandler(h) - logger.propagate = propagate - logger.disabled = 0 - hlist = cp.get(sectname, "handlers") - if len(hlist): - hlist = string.split(hlist, ",") - for hand in hlist: - logger.addHandler(handlers[hand]) - #Disable any old loggers. There's no point deleting - #them as other threads may continue to hold references - #and by disabling them, you stop them doing any logging. - for log in existing: - root.manager.loggerDict[log].disabled = 1 - except: - import traceback - ei = sys.exc_info() - traceback.print_exception(ei[0], ei[1], ei[2], None, sys.stderr) - del ei - finally: - logging._releaseLock() - -def listen(port=DEFAULT_LOGGING_CONFIG_PORT): - """ - Start up a socket server on the specified port, and listen for new - configurations. - - These will be sent as a file suitable for processing by fileConfig(). - Returns a Thread object on which you can call start() to start the server, - and which you can join() when appropriate. To stop the server, call - stopListening(). 
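The wire format listen() describes, a 4-byte big-endian length followed by the configuration file body, is the same one logging.config.listen() still uses. A client pushing a new configuration could look like this sketch; the host, port, and filename are hypothetical:

import socket
import struct

def push_logging_config(host: str, port: int, config_text: str) -> None:
    """Send an INI logging config, length-prefixed, to a listen() server."""
    payload = config_text.encode("utf-8")
    with socket.create_connection((host, port)) as sock:
        # 4-byte big-endian length, then the config file body
        sock.sendall(struct.pack(">L", len(payload)) + payload)

# Hypothetical usage against a server started on the default port 9030:
# push_logging_config("localhost", 9030, open("logging.ini").read())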
- """ - if not thread: - raise NotImplementedError, "listen() needs threading to work" - - class ConfigStreamHandler(StreamRequestHandler): - """ - Handler for a logging configuration request. - - It expects a completely new logging configuration and uses fileConfig - to install it. - """ - def handle(self): - """ - Handle a request. - - Each request is expected to be a 4-byte length, - followed by the config file. Uses fileConfig() to do the - grunt work. - """ - import tempfile - try: - conn = self.connection - chunk = conn.recv(4) - if len(chunk) == 4: - slen = struct.unpack(">L", chunk)[0] - chunk = self.connection.recv(slen) - while len(chunk) < slen: - chunk = chunk + conn.recv(slen - len(chunk)) - #Apply new configuration. We'd like to be able to - #create a StringIO and pass that in, but unfortunately - #1.5.2 ConfigParser does not support reading file - #objects, only actual files. So we create a temporary - #file and remove it later. - file = tempfile.mktemp(".ini") - f = open(file, "w") - f.write(chunk) - f.close() - fileConfig(file) - os.remove(file) - except socket.error, e: - if type(e.args) != types.TupleType: - raise - else: - errcode = e.args[0] - if errcode != RESET_ERROR: - raise - - class ConfigSocketReceiver(ThreadingTCPServer): - """ - A simple TCP socket-based logging config receiver. - """ - - allow_reuse_address = 1 - - def __init__(self, host='localhost', port=DEFAULT_LOGGING_CONFIG_PORT, - handler=None): - ThreadingTCPServer.__init__(self, (host, port), handler) - logging._acquireLock() - self.abort = 0 - logging._releaseLock() - self.timeout = 1 - - def serve_until_stopped(self): - import select - abort = 0 - while not abort: - rd, wr, ex = select.select([self.socket.fileno()], - [], [], - self.timeout) - if rd: - self.handle_request() - logging._acquireLock() - abort = self.abort - logging._releaseLock() - - def serve(rcvr, hdlr, port): - server = rcvr(port=port, handler=hdlr) - global _listener - logging._acquireLock() - _listener = server - logging._releaseLock() - server.serve_until_stopped() - - return threading.Thread(target=serve, - args=(ConfigSocketReceiver, - ConfigStreamHandler, port)) - -def stopListening(): - """ - Stop the listening server which was created with a call to listen(). - """ - global _listener - if _listener: - logging._acquireLock() - _listener.abort = 1 - _listener = None - logging._releaseLock() diff --git a/code/planet/compat_logging/handlers.py b/code/planet/compat_logging/handlers.py deleted file mode 100644 index 26ca8adc..00000000 --- a/code/planet/compat_logging/handlers.py +++ /dev/null @@ -1,728 +0,0 @@ -# Copyright 2001-2002 by Vinay Sajip. All Rights Reserved. -# -# Permission to use, copy, modify, and distribute this software and its -# documentation for any purpose and without fee is hereby granted, -# provided that the above copyright notice appear in all copies and that -# both that copyright notice and this permission notice appear in -# supporting documentation, and that the name of Vinay Sajip -# not be used in advertising or publicity pertaining to distribution -# of the software without specific, written prior permission. -# VINAY SAJIP DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING -# ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL -# VINAY SAJIP BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR -# ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER -# IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT -# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -""" -Logging package for Python. Based on PEP 282 and comments thereto in -comp.lang.python, and influenced by Apache's log4j system. - -Should work under Python versions >= 1.5.2, except that source line -information is not available unless 'inspect' is. - -Copyright (C) 2001-2002 Vinay Sajip. All Rights Reserved. - -To use, simply 'import logging' and log away! -""" - -import sys, logging, socket, types, os, string, cPickle, struct, time - -from SocketServer import ThreadingTCPServer, StreamRequestHandler - -# -# Some constants... -# - -DEFAULT_TCP_LOGGING_PORT = 9020 -DEFAULT_UDP_LOGGING_PORT = 9021 -DEFAULT_HTTP_LOGGING_PORT = 9022 -DEFAULT_SOAP_LOGGING_PORT = 9023 -SYSLOG_UDP_PORT = 514 - - -class RotatingFileHandler(logging.FileHandler): - def __init__(self, filename, mode="a", maxBytes=0, backupCount=0): - """ - Open the specified file and use it as the stream for logging. - - By default, the file grows indefinitely. You can specify particular - values of maxBytes and backupCount to allow the file to rollover at - a predetermined size. - - Rollover occurs whenever the current log file is nearly maxBytes in - length. If backupCount is >= 1, the system will successively create - new files with the same pathname as the base file, but with extensions - ".1", ".2" etc. appended to it. For example, with a backupCount of 5 - and a base file name of "app.log", you would get "app.log", - "app.log.1", "app.log.2", ... through to "app.log.5". The file being - written to is always "app.log" - when it gets filled up, it is closed - and renamed to "app.log.1", and if files "app.log.1", "app.log.2" etc. - exist, then they are renamed to "app.log.2", "app.log.3" etc. - respectively. - - If maxBytes is zero, rollover never occurs. - """ - logging.FileHandler.__init__(self, filename, mode) - self.maxBytes = maxBytes - self.backupCount = backupCount - if maxBytes > 0: - self.mode = "a" - - def doRollover(self): - """ - Do a rollover, as described in __init__(). - """ - - self.stream.close() - if self.backupCount > 0: - for i in range(self.backupCount - 1, 0, -1): - sfn = "%s.%d" % (self.baseFilename, i) - dfn = "%s.%d" % (self.baseFilename, i + 1) - if os.path.exists(sfn): - #print "%s -> %s" % (sfn, dfn) - if os.path.exists(dfn): - os.remove(dfn) - os.rename(sfn, dfn) - dfn = self.baseFilename + ".1" - if os.path.exists(dfn): - os.remove(dfn) - os.rename(self.baseFilename, dfn) - #print "%s -> %s" % (self.baseFilename, dfn) - self.stream = open(self.baseFilename, "w") - - def emit(self, record): - """ - Emit a record. - - Output the record to the file, catering for rollover as described - in doRollover(). - """ - if self.maxBytes > 0: # are we rolling over? - msg = "%s\n" % self.format(record) - self.stream.seek(0, 2) #due to non-posix-compliant Windows feature - if self.stream.tell() + len(msg) >= self.maxBytes: - self.doRollover() - logging.FileHandler.emit(self, record) - - -class SocketHandler(logging.Handler): - """ - A handler class which writes logging records, in pickle format, to - a streaming socket. The socket is kept open across logging calls. - If the peer resets it, an attempt is made to reconnect on the next call. 
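The receiving side of this length-prefixed pickle stream is not part of the handler itself. A minimal receiver against the modern standard library (the same pattern the logging cookbook documents) might look like:

import logging
import pickle
import socketserver
import struct

class LogRecordReceiver(socketserver.StreamRequestHandler):
    """Unpack SocketHandler's length-prefixed pickles back into LogRecords."""
    def handle(self):
        while True:
            head = self.connection.recv(4)
            if len(head) < 4:
                break
            slen = struct.unpack(">L", head)[0]
            data = self.connection.recv(slen)
            while len(data) < slen:
                data += self.connection.recv(slen - len(data))
            # NB: only unpickle data from sources you trust
            record = logging.makeLogRecord(pickle.loads(data))
            logging.getLogger(record.name).handle(record)

# socketserver.TCPServer(("localhost", 9020), LogRecordReceiver).serve_forever()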
- The pickle which is sent is that of the LogRecord's attribute dictionary - (__dict__), so that the receiver does not need to have the logging module - installed in order to process the logging event. - - To unpickle the record at the receiving end into a LogRecord, use the - makeLogRecord function. - """ - - def __init__(self, host, port): - """ - Initializes the handler with a specific host address and port. - - The attribute 'closeOnError' is set to 1 - which means that if - a socket error occurs, the socket is silently closed and then - reopened on the next logging call. - """ - logging.Handler.__init__(self) - self.host = host - self.port = port - self.sock = None - self.closeOnError = 0 - - def makeSocket(self): - """ - A factory method which allows subclasses to define the precise - type of socket they want. - """ - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.connect((self.host, self.port)) - return s - - def send(self, s): - """ - Send a pickled string to the socket. - - This function allows for partial sends which can happen when the - network is busy. - """ - if hasattr(self.sock, "sendall"): - self.sock.sendall(s) - else: - sentsofar = 0 - left = len(s) - while left > 0: - sent = self.sock.send(s[sentsofar:]) - sentsofar = sentsofar + sent - left = left - sent - - def makePickle(self, record): - """ - Pickles the record in binary format with a length prefix, and - returns it ready for transmission across the socket. - """ - s = cPickle.dumps(record.__dict__, 1) - #n = len(s) - #slen = "%c%c" % ((n >> 8) & 0xFF, n & 0xFF) - slen = struct.pack(">L", len(s)) - return slen + s - - def handleError(self, record): - """ - Handle an error during logging. - - An error has occurred during logging. Most likely cause - - connection lost. Close the socket so that we can retry on the - next event. - """ - if self.closeOnError and self.sock: - self.sock.close() - self.sock = None #try to reconnect next time - else: - logging.Handler.handleError(self, record) - - def emit(self, record): - """ - Emit a record. - - Pickles the record and writes it to the socket in binary format. - If there is an error with the socket, silently drop the packet. - If there was a problem with the socket, re-establishes the - socket. - """ - try: - s = self.makePickle(record) - if not self.sock: - self.sock = self.makeSocket() - self.send(s) - except: - self.handleError(record) - - def close(self): - """ - Closes the socket. - """ - if self.sock: - self.sock.close() - self.sock = None - -class DatagramHandler(SocketHandler): - """ - A handler class which writes logging records, in pickle format, to - a datagram socket. The pickle which is sent is that of the LogRecord's - attribute dictionary (__dict__), so that the receiver does not need to - have the logging module installed in order to process the logging event. - - To unpickle the record at the receiving end into a LogRecord, use the - makeLogRecord function. - - """ - def __init__(self, host, port): - """ - Initializes the handler with a specific host address and port. - """ - SocketHandler.__init__(self, host, port) - self.closeOnError = 0 - - def makeSocket(self): - """ - The factory method of SocketHandler is here overridden to create - a UDP socket (SOCK_DGRAM). - """ - s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - return s - - def send(self, s): - """ - Send a pickled string to a socket. 
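The wire format implemented by makePickle() above — a four-byte big-endian length prefix followed by a pickle of the LogRecord's __dict__ — is the same one Python 3's stdlib SocketHandler still emits. A minimal sketch of the receiving side, assuming an already-connected socket (names are illustrative):

    import logging
    import pickle
    import socket
    import struct

    def recv_one_record(conn: socket.socket) -> logging.LogRecord:
        (size,) = struct.unpack(">L", conn.recv(4))  # length prefix
        data = b""
        while len(data) < size:                      # tolerate partial reads
            data += conn.recv(size - len(data))
        # makeLogRecord rebuilds a LogRecord without importing the sender's code
        return logging.makeLogRecord(pickle.loads(data))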
- - This function no longer allows for partial sends which can happen - when the network is busy - UDP does not guarantee delivery and - can deliver packets out of sequence. - """ - self.sock.sendto(s, (self.host, self.port)) - -class SysLogHandler(logging.Handler): - """ - A handler class which sends formatted logging records to a syslog - server. Based on Sam Rushing's syslog module: - http://www.nightmare.com/squirl/python-ext/misc/syslog.py - Contributed by Nicolas Untz (after which minor refactoring changes - have been made). - """ - - # from : - # ====================================================================== - # priorities/facilities are encoded into a single 32-bit quantity, where - # the bottom 3 bits are the priority (0-7) and the top 28 bits are the - # facility (0-big number). Both the priorities and the facilities map - # roughly one-to-one to strings in the syslogd(8) source code. This - # mapping is included in this file. - # - # priorities (these are ordered) - - LOG_EMERG = 0 # system is unusable - LOG_ALERT = 1 # action must be taken immediately - LOG_CRIT = 2 # critical conditions - LOG_ERR = 3 # error conditions - LOG_WARNING = 4 # warning conditions - LOG_NOTICE = 5 # normal but significant condition - LOG_INFO = 6 # informational - LOG_DEBUG = 7 # debug-level messages - - # facility codes - LOG_KERN = 0 # kernel messages - LOG_USER = 1 # random user-level messages - LOG_MAIL = 2 # mail system - LOG_DAEMON = 3 # system daemons - LOG_AUTH = 4 # security/authorization messages - LOG_SYSLOG = 5 # messages generated internally by syslogd - LOG_LPR = 6 # line printer subsystem - LOG_NEWS = 7 # network news subsystem - LOG_UUCP = 8 # UUCP subsystem - LOG_CRON = 9 # clock daemon - LOG_AUTHPRIV = 10 # security/authorization messages (private) - - # other codes through 15 reserved for system use - LOG_LOCAL0 = 16 # reserved for local use - LOG_LOCAL1 = 17 # reserved for local use - LOG_LOCAL2 = 18 # reserved for local use - LOG_LOCAL3 = 19 # reserved for local use - LOG_LOCAL4 = 20 # reserved for local use - LOG_LOCAL5 = 21 # reserved for local use - LOG_LOCAL6 = 22 # reserved for local use - LOG_LOCAL7 = 23 # reserved for local use - - priority_names = { - "alert": LOG_ALERT, - "crit": LOG_CRIT, - "critical": LOG_CRIT, - "debug": LOG_DEBUG, - "emerg": LOG_EMERG, - "err": LOG_ERR, - "error": LOG_ERR, # DEPRECATED - "info": LOG_INFO, - "notice": LOG_NOTICE, - "panic": LOG_EMERG, # DEPRECATED - "warn": LOG_WARNING, # DEPRECATED - "warning": LOG_WARNING, - } - - facility_names = { - "auth": LOG_AUTH, - "authpriv": LOG_AUTHPRIV, - "cron": LOG_CRON, - "daemon": LOG_DAEMON, - "kern": LOG_KERN, - "lpr": LOG_LPR, - "mail": LOG_MAIL, - "news": LOG_NEWS, - "security": LOG_AUTH, # DEPRECATED - "syslog": LOG_SYSLOG, - "user": LOG_USER, - "uucp": LOG_UUCP, - "local0": LOG_LOCAL0, - "local1": LOG_LOCAL1, - "local2": LOG_LOCAL2, - "local3": LOG_LOCAL3, - "local4": LOG_LOCAL4, - "local5": LOG_LOCAL5, - "local6": LOG_LOCAL6, - "local7": LOG_LOCAL7, - } - - def __init__(self, address=('localhost', SYSLOG_UDP_PORT), facility=LOG_USER): - """ - Initialize a handler. - - If address is specified as a string, UNIX socket is used. - If facility is not specified, LOG_USER is used. 
- """ - logging.Handler.__init__(self) - - self.address = address - self.facility = facility - if type(address) == types.StringType: - self.socket = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) - # syslog may require either DGRAM or STREAM sockets - try: - self.socket.connect(address) - except socket.error: - self.socket.close() - self.socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - self.socket.connect(address) - self.unixsocket = 1 - else: - self.socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - self.unixsocket = 0 - - self.formatter = None - - # curious: when talking to the unix-domain '/dev/log' socket, a - # zero-terminator seems to be required. this string is placed - # into a class variable so that it can be overridden if - # necessary. - log_format_string = '<%d>%s\000' - - def encodePriority (self, facility, priority): - """ - Encode the facility and priority. You can pass in strings or - integers - if strings are passed, the facility_names and - priority_names mapping dictionaries are used to convert them to - integers. - """ - if type(facility) == types.StringType: - facility = self.facility_names[facility] - if type(priority) == types.StringType: - priority = self.priority_names[priority] - return (facility << 3) | priority - - def close (self): - """ - Closes the socket. - """ - if self.unixsocket: - self.socket.close() - - def emit(self, record): - """ - Emit a record. - - The record is formatted, and then sent to the syslog server. If - exception information is present, it is NOT sent to the server. - """ - msg = self.format(record) - """ - We need to convert record level to lowercase, maybe this will - change in the future. - """ - msg = self.log_format_string % ( - self.encodePriority(self.facility, - string.lower(record.levelname)), - msg) - try: - if self.unixsocket: - self.socket.send(msg) - else: - self.socket.sendto(msg, self.address) - except: - self.handleError(record) - -class SMTPHandler(logging.Handler): - """ - A handler class which sends an SMTP email for each logging event. - """ - def __init__(self, mailhost, fromaddr, toaddrs, subject): - """ - Initialize the handler. - - Initialize the instance with the from and to addresses and subject - line of the email. To specify a non-standard SMTP port, use the - (host, port) tuple format for the mailhost argument. - """ - logging.Handler.__init__(self) - if type(mailhost) == types.TupleType: - host, port = mailhost - self.mailhost = host - self.mailport = port - else: - self.mailhost = mailhost - self.mailport = None - self.fromaddr = fromaddr - if type(toaddrs) == types.StringType: - toaddrs = [toaddrs] - self.toaddrs = toaddrs - self.subject = subject - - def getSubject(self, record): - """ - Determine the subject for the email. - - If you want to specify a subject line which is record-dependent, - override this method. - """ - return self.subject - - weekdayname = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] - - monthname = [None, - 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', - 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] - - def date_time(self): - """Return the current date and time formatted for a MIME header.""" - year, month, day, hh, mm, ss, wd, y, z = time.gmtime(time.time()) - s = "%s, %02d %3s %4d %02d:%02d:%02d GMT" % ( - self.weekdayname[wd], - day, self.monthname[month], year, - hh, mm, ss) - return s - - def emit(self, record): - """ - Emit a record. - - Format the record and send it to the specified addressees. 
- """ - try: - import smtplib - port = self.mailport - if not port: - port = smtplib.SMTP_PORT - smtp = smtplib.SMTP(self.mailhost, port) - msg = self.format(record) - msg = "From: %s\r\nTo: %s\r\nSubject: %s\r\nDate: %s\r\n\r\n%s" % ( - self.fromaddr, - string.join(self.toaddrs, ","), - self.getSubject(record), - self.date_time(), msg) - smtp.sendmail(self.fromaddr, self.toaddrs, msg) - smtp.quit() - except: - self.handleError(record) - -class NTEventLogHandler(logging.Handler): - """ - A handler class which sends events to the NT Event Log. Adds a - registry entry for the specified application name. If no dllname is - provided, win32service.pyd (which contains some basic message - placeholders) is used. Note that use of these placeholders will make - your event logs big, as the entire message source is held in the log. - If you want slimmer logs, you have to pass in the name of your own DLL - which contains the message definitions you want to use in the event log. - """ - def __init__(self, appname, dllname=None, logtype="Application"): - logging.Handler.__init__(self) - try: - import win32evtlogutil, win32evtlog - self.appname = appname - self._welu = win32evtlogutil - if not dllname: - dllname = os.path.split(self._welu.__file__) - dllname = os.path.split(dllname[0]) - dllname = os.path.join(dllname[0], r'win32service.pyd') - self.dllname = dllname - self.logtype = logtype - self._welu.AddSourceToRegistry(appname, dllname, logtype) - self.deftype = win32evtlog.EVENTLOG_ERROR_TYPE - self.typemap = { - logging.DEBUG : win32evtlog.EVENTLOG_INFORMATION_TYPE, - logging.INFO : win32evtlog.EVENTLOG_INFORMATION_TYPE, - logging.WARNING : win32evtlog.EVENTLOG_WARNING_TYPE, - logging.ERROR : win32evtlog.EVENTLOG_ERROR_TYPE, - logging.CRITICAL: win32evtlog.EVENTLOG_ERROR_TYPE, - } - except ImportError: - print "The Python Win32 extensions for NT (service, event "\ - "logging) appear not to be available." - self._welu = None - - def getMessageID(self, record): - """ - Return the message ID for the event record. If you are using your - own messages, you could do this by having the msg passed to the - logger being an ID rather than a formatting string. Then, in here, - you could use a dictionary lookup to get the message ID. This - version returns 1, which is the base message ID in win32service.pyd. - """ - return 1 - - def getEventCategory(self, record): - """ - Return the event category for the record. - - Override this if you want to specify your own categories. This version - returns 0. - """ - return 0 - - def getEventType(self, record): - """ - Return the event type for the record. - - Override this if you want to specify your own types. This version does - a mapping using the handler's typemap attribute, which is set up in - __init__() to a dictionary which contains mappings for DEBUG, INFO, - WARNING, ERROR and CRITICAL. If you are using your own levels you will - either need to override this method or place a suitable dictionary in - the handler's typemap attribute. - """ - return self.typemap.get(record.levelno, self.deftype) - - def emit(self, record): - """ - Emit a record. - - Determine the message ID, event category and event type. Then - log the message in the NT event log. - """ - if self._welu: - try: - id = self.getMessageID(record) - cat = self.getEventCategory(record) - type = self.getEventType(record) - msg = self.format(record) - self._welu.ReportEvent(self.appname, id, cat, type, [msg]) - except: - self.handleError(record) - - def close(self): - """ - Clean up this handler. 
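The date_time() helper above is hand-rolled because time.strftime()'s %a and %b directives are locale-dependent, while the dates in mail headers must be in English. On Python 3 the stdlib covers this directly; a minimal sketch:

    from email.utils import formatdate

    # Locale-independent RFC 2822 date, e.g. "Mon, 01 Jan 2024 12:00:00 GMT"
    print(formatdate(usegmt=True))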
- - You can remove the application name from the registry as a - source of event log entries. However, if you do this, you will - not be able to see the events as you intended in the Event Log - Viewer - it needs to be able to access the registry to get the - DLL name. - """ - #self._welu.RemoveSourceFromRegistry(self.appname, self.logtype) - pass - -class HTTPHandler(logging.Handler): - """ - A class which sends records to a Web server, using either GET or - POST semantics. - """ - def __init__(self, host, url, method="GET"): - """ - Initialize the instance with the host, the request URL, and the method - ("GET" or "POST") - """ - logging.Handler.__init__(self) - method = string.upper(method) - if method not in ["GET", "POST"]: - raise ValueError, "method must be GET or POST" - self.host = host - self.url = url - self.method = method - - def mapLogRecord(self, record): - """ - Default implementation of mapping the log record into a dict - that is send as the CGI data. Overwrite in your class. - Contributed by Franz Glasner. - """ - return record.__dict__ - - def emit(self, record): - """ - Emit a record. - - Send the record to the Web server as an URL-encoded dictionary - """ - try: - import httplib, urllib - h = httplib.HTTP(self.host) - url = self.url - data = urllib.urlencode(self.mapLogRecord(record)) - if self.method == "GET": - if (string.find(url, '?') >= 0): - sep = '&' - else: - sep = '?' - url = url + "%c%s" % (sep, data) - h.putrequest(self.method, url) - if self.method == "POST": - h.putheader("Content-length", str(len(data))) - h.endheaders() - if self.method == "POST": - h.send(data) - h.getreply() #can't do anything with the result - except: - self.handleError(record) - -class BufferingHandler(logging.Handler): - """ - A handler class which buffers logging records in memory. Whenever each - record is added to the buffer, a check is made to see if the buffer should - be flushed. If it should, then flush() is expected to do what's needed. - """ - def __init__(self, capacity): - """ - Initialize the handler with the buffer size. - """ - logging.Handler.__init__(self) - self.capacity = capacity - self.buffer = [] - - def shouldFlush(self, record): - """ - Should the handler flush its buffer? - - Returns true if the buffer is up to capacity. This method can be - overridden to implement custom flushing strategies. - """ - return (len(self.buffer) >= self.capacity) - - def emit(self, record): - """ - Emit a record. - - Append the record. If shouldFlush() tells us to, call flush() to process - the buffer. - """ - self.buffer.append(record) - if self.shouldFlush(record): - self.flush() - - def flush(self): - """ - Override to implement custom flushing behaviour. - - This version just zaps the buffer to empty. - """ - self.buffer = [] - -class MemoryHandler(BufferingHandler): - """ - A handler class which buffers logging records in memory, periodically - flushing them to a target handler. Flushing occurs whenever the buffer - is full, or when an event of a certain severity or greater is seen. - """ - def __init__(self, capacity, flushLevel=logging.ERROR, target=None): - """ - Initialize the handler with the buffer size, the level at which - flushing should occur and an optional target. - - Note that without a target being set either here or via setTarget(), - a MemoryHandler is no use to anyone! 
- """ - BufferingHandler.__init__(self, capacity) - self.flushLevel = flushLevel - self.target = target - - def shouldFlush(self, record): - """ - Check for buffer full or a record at the flushLevel or higher. - """ - return (len(self.buffer) >= self.capacity) or \ - (record.levelno >= self.flushLevel) - - def setTarget(self, target): - """ - Set the target handler for this handler. - """ - self.target = target - - def flush(self): - """ - For a MemoryHandler, flushing means just sending the buffered - records to the target, if there is one. Override if you want - different behaviour. - """ - if self.target: - for record in self.buffer: - self.target.handle(record) - self.buffer = [] - - def close(self): - """ - Flush, set the target to None and lose the buffer. - """ - self.flush() - self.target = None - self.buffer = [] diff --git a/code/planet/feedparser.py b/code/planet/feedparser.py deleted file mode 100644 index cd7ac83d..00000000 --- a/code/planet/feedparser.py +++ /dev/null @@ -1,2937 +0,0 @@ -#!/usr/bin/env python -"""Universal feed parser - -Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds - -Visit http://feedparser.org/ for the latest version -Visit http://feedparser.org/docs/ for the latest documentation - -Required: Python 2.1 or later -Recommended: Python 2.3 or later -Recommended: CJKCodecs and iconv_codec -""" - -__version__ = "4.1"# + "$Revision: 1.92 $"[11:15] + "-cvs" -__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE.""" -__author__ = "Mark Pilgrim " -__contributors__ = ["Jason Diamond ", - "John Beimler ", - "Fazal Majid ", - "Aaron Swartz ", - "Kevin Marks "] -_debug = 0 - -# HTTP "User-Agent" header to send to servers when downloading feeds. -# If you are embedding feedparser in a larger application, you should -# change this to your application name and URL. -USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__ - -# HTTP "Accept" header to send to servers when downloading feeds. If you don't -# want to send an Accept header, set this to None. -ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1" - -# List of preferred XML parsers, by SAX driver name. 
These will be tried first,
-# but if they're not installed, Python will keep searching through its own list
-# of pre-installed parsers until it finds one that supports everything we need.
-PREFERRED_XML_PARSERS = ["drv_libxml2"]
-
-# If you want feedparser to automatically run HTML markup through HTML Tidy, set
-# this to 1.  Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
-# or utidylib <http://utidylib.berlios.de/>.
-TIDY_MARKUP = 0
-
-# List of Python interfaces for HTML Tidy, in order of preference.  Only useful
-# if TIDY_MARKUP = 1
-PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
-
-# ---------- required modules (should come with any Python distribution) ----------
-import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
-try:
-    from cStringIO import StringIO as _StringIO
-except:
-    from StringIO import StringIO as _StringIO
-
-# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
-
-# gzip is included with most Python distributions, but may not be available if you compiled your own
-try:
-    import gzip
-except:
-    gzip = None
-try:
-    import zlib
-except:
-    zlib = None
-
-# If a real XML parser is available, feedparser will attempt to use it.  feedparser has
-# been tested with the built-in SAX parser, PyXML, and libxml2.  On platforms where the
-# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
-# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
-try:
-    import xml.sax
-    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
-    from xml.sax.saxutils import escape as _xmlescape
-    _XML_AVAILABLE = 1
-except:
-    _XML_AVAILABLE = 0
-    def _xmlescape(data,entities={}):
-        data = data.replace('&', '&amp;')
-        data = data.replace('>', '&gt;')
-        data = data.replace('<', '&lt;')
-        for char, entity in entities:
-            data = data.replace(char, entity)
-        return data
-
-# base64 support for Atom feeds that contain embedded binary data
-try:
-    import base64, binascii
-except:
-    base64 = binascii = None
-
-# cjkcodecs and iconv_codec provide support for more character encodings.
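One subtlety in the _xmlescape() fallback above: '&' must be replaced first, otherwise the ampersands introduced by '&gt;' and '&lt;' would themselves be escaped again. The stdlib escaper it substitutes for behaves the same way; a quick check:

    from xml.sax.saxutils import escape

    # '&' is escaped before '<' and '>', so entities are not double-escaped
    assert escape("a < b & c") == "a &lt; b &amp; c"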
-# Both are available from http://cjkpython.i18n.org/ -try: - import cjkcodecs.aliases -except: - pass -try: - import iconv_codec -except: - pass - -# chardet library auto-detects character encodings -# Download from http://chardet.feedparser.org/ -try: - import chardet - if _debug: - import chardet.constants - chardet.constants._debug = 1 -except: - chardet = None - -# ---------- don't touch these ---------- -class ThingsNobodyCaresAboutButMe(Exception): pass -class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass -class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass -class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass -class UndeclaredNamespace(Exception): pass - -sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') -sgmllib.special = re.compile('' % (tag, self.strattrs(attrs)), escape=0) - - # match namespaces - if tag.find(':') <> -1: - prefix, suffix = tag.split(':', 1) - else: - prefix, suffix = '', tag - prefix = self.namespacemap.get(prefix, prefix) - if prefix: - prefix = prefix + '_' - - # special hack for better tracking of empty textinput/image elements in illformed feeds - if (not prefix) and tag not in ('title', 'link', 'description', 'name'): - self.intextinput = 0 - if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'): - self.inimage = 0 - - # call special handler (if defined) or default handler - methodname = '_start_' + prefix + suffix - try: - method = getattr(self, methodname) - return method(attrsD) - except AttributeError: - return self.push(prefix + suffix, 1) - - def unknown_endtag(self, tag): - if _debug: sys.stderr.write('end %s\n' % tag) - # match namespaces - if tag.find(':') <> -1: - prefix, suffix = tag.split(':', 1) - else: - prefix, suffix = '', tag - prefix = self.namespacemap.get(prefix, prefix) - if prefix: - prefix = prefix + '_' - - # call special handler (if defined) or default handler - methodname = '_end_' + prefix + suffix - try: - method = getattr(self, methodname) - method() - except AttributeError: - self.pop(prefix + suffix) - - # track inline content - if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): - # element declared itself as escaped markup, but it isn't really - self.contentparams['type'] = 'application/xhtml+xml' - if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml': - tag = tag.split(':')[-1] - self.handle_data('' % tag, escape=0) - - # track xml:base and xml:lang going out of scope - if self.basestack: - self.basestack.pop() - if self.basestack and self.basestack[-1]: - self.baseuri = self.basestack[-1] - if self.langstack: - self.langstack.pop() - if self.langstack: # and (self.langstack[-1] is not None): - self.lang = self.langstack[-1] - - def handle_charref(self, ref): - # called for each character reference, e.g. for ' ', ref will be '160' - if not self.elementstack: return - ref = ref.lower() - if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'): - text = '&#%s;' % ref - else: - if ref[0] == 'x': - c = int(ref[1:], 16) - else: - c = int(ref) - text = unichr(c).encode('utf-8') - self.elementstack[-1][2].append(text) - - def handle_entityref(self, ref): - # called for each entity reference, e.g. 
for '©', ref will be 'copy' - if not self.elementstack: return - if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref) - if ref in ('lt', 'gt', 'quot', 'amp', 'apos'): - text = '&%s;' % ref - else: - # entity resolution graciously donated by Aaron Swartz - def name2cp(k): - import htmlentitydefs - if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3 - return htmlentitydefs.name2codepoint[k] - k = htmlentitydefs.entitydefs[k] - if k.startswith('&#') and k.endswith(';'): - return int(k[2:-1]) # not in latin-1 - return ord(k) - try: name2cp(ref) - except KeyError: text = '&%s;' % ref - else: text = unichr(name2cp(ref)).encode('utf-8') - self.elementstack[-1][2].append(text) - - def handle_data(self, text, escape=1): - # called for each block of plain text, i.e. outside of any tag and - # not containing any character or entity references - if not self.elementstack: return - if escape and self.contentparams.get('type') == 'application/xhtml+xml': - text = _xmlescape(text) - self.elementstack[-1][2].append(text) - - def handle_comment(self, text): - # called for each comment, e.g. - pass - - def handle_pi(self, text): - # called for each processing instruction, e.g. - pass - - def handle_decl(self, text): - pass - - def parse_declaration(self, i): - # override internal declaration handler to handle CDATA blocks - if _debug: sys.stderr.write('entering parse_declaration\n') - if self.rawdata[i:i+9] == '', i) - if k == -1: k = len(self.rawdata) - self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0) - return k+3 - else: - k = self.rawdata.find('>', i) - return k+1 - - def mapContentType(self, contentType): - contentType = contentType.lower() - if contentType == 'text': - contentType = 'text/plain' - elif contentType == 'html': - contentType = 'text/html' - elif contentType == 'xhtml': - contentType = 'application/xhtml+xml' - return contentType - - def trackNamespace(self, prefix, uri): - loweruri = uri.lower() - if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version: - self.version = 'rss090' - if loweruri == 'http://purl.org/rss/1.0/' and not self.version: - self.version = 'rss10' - if loweruri == 'http://www.w3.org/2005/atom' and not self.version: - self.version = 'atom10' - if loweruri.find('backend.userland.com/rss') <> -1: - # match any backend.userland.com namespace - uri = 'http://backend.userland.com/rss' - loweruri = uri - if self._matchnamespaces.has_key(loweruri): - self.namespacemap[prefix] = self._matchnamespaces[loweruri] - self.namespacesInUse[self._matchnamespaces[loweruri]] = uri - else: - self.namespacesInUse[prefix or ''] = uri - - def resolveURI(self, uri): - return _urljoin(self.baseuri or '', uri) - - def decodeEntities(self, element, data): - return data - - def strattrs(self, attrs): - return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'"'})) for t in attrs]) - - def push(self, element, expectingText): - self.elementstack.append([element, expectingText, []]) - - def pop(self, element, stripWhitespace=1): - if not self.elementstack: return - if self.elementstack[-1][0] != element: return - - element, expectingText, pieces = self.elementstack.pop() - - if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml': - # remove enclosing child element, but only if it is a
<div> and
-            #    only if all the remaining content is nested underneath it.
-            #    This means that the divs would be retained in the following:
-            #       <div>foo</div><div>bar</div>
-            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
-                depth = 0
-                for piece in pieces[:-1]:
-                    if piece.startswith('</'):
-                        depth -= 1
-                        if depth == 0: break
-                    elif piece.startswith('<') and not piece.endswith('/>'):
-                        depth += 1
-                else:
-                    pieces = pieces[1:-1]
-
-        output = ''.join(pieces)
-        if stripWhitespace:
-            output = output.strip()
-        if not expectingText: return output
-
-        # decode base64 content
-        if base64 and self.contentparams.get('base64', 0):
-            try:
-                output = base64.decodestring(output)
-            except binascii.Error:
-                pass
-            except binascii.Incomplete:
-                pass
-
-        # resolve relative URIs
-        if (element in self.can_be_relative_uri) and output:
-            output = self.resolveURI(output)
-
-        # decode entities within embedded markup
-        if not self.contentparams.get('base64', 0):
-            output = self.decodeEntities(element, output)
-
-        # remove temporary cruft from contentparams
-        try:
-            del self.contentparams['mode']
-        except KeyError:
-            pass
-        try:
-            del self.contentparams['base64']
-        except KeyError:
-            pass
-
-        # resolve relative URIs within embedded markup
-        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
-            if element in self.can_contain_relative_uris:
-                output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
-
-        # sanitize embedded markup
-        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
-            if element in self.can_contain_dangerous_markup:
-                output = _sanitizeHTML(output, self.encoding)
-
-        if self.encoding and type(output) != type(u''):
-            try:
-                output = unicode(output, self.encoding)
-            except:
-                pass
-
-        # address common error where people take data that is already
-        # utf-8, presume that it is iso-8859-1, and re-encode it.
-        if self.encoding=='utf-8' and type(output) == type(u''):
-            try:
-                output = unicode(output.encode('iso-8859-1'), 'utf-8')
-            except:
-                pass
-
-        # map win-1252 extensions to the proper code points
-        if type(output) == type(u''):
-            output = u''.join([c in cp1252 and cp1252[c] or c for c in output])
-
-        # categories/tags/keywords/whatever are handled in _end_category
-        if element == 'category':
-            return output
-
-        # store output in appropriate place(s)
-        if self.inentry and not self.insource:
-            if element == 'content':
-                self.entries[-1].setdefault(element, [])
-                contentparams = copy.deepcopy(self.contentparams)
-                contentparams['value'] = output
-                self.entries[-1][element].append(contentparams)
-            elif element == 'link':
-                self.entries[-1][element] = output
-                if output:
-                    self.entries[-1]['links'][-1]['href'] = output
-            else:
-                if element == 'description':
-                    element = 'summary'
-                self.entries[-1][element] = output
-                if self.incontent:
-                    contentparams = copy.deepcopy(self.contentparams)
-                    contentparams['value'] = output
-                    self.entries[-1][element + '_detail'] = contentparams
-        elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
-            context = self._getContext()
-            if element == 'description':
-                element = 'subtitle'
-            context[element] = output
-            if element == 'link':
-                context['links'][-1]['href'] = output
-            elif self.incontent:
-                contentparams = copy.deepcopy(self.contentparams)
-                contentparams['value'] = output
-                context[element + '_detail'] = contentparams
-        return output
-
-    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
-        self.incontent += 1
-        self.contentparams = FeedParserDict({
-            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
-            'language': self.lang,
-            'base': self.baseuri})
-        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
-        self.push(tag, expectingText)
-
-    def popContent(self, tag):
-        value = self.pop(tag)
-        self.incontent -= 1
- self.contentparams.clear() - return value - - def _mapToStandardPrefix(self, name): - colonpos = name.find(':') - if colonpos <> -1: - prefix = name[:colonpos] - suffix = name[colonpos+1:] - prefix = self.namespacemap.get(prefix, prefix) - name = prefix + ':' + suffix - return name - - def _getAttribute(self, attrsD, name): - return attrsD.get(self._mapToStandardPrefix(name)) - - def _isBase64(self, attrsD, contentparams): - if attrsD.get('mode', '') == 'base64': - return 1 - if self.contentparams['type'].startswith('text/'): - return 0 - if self.contentparams['type'].endswith('+xml'): - return 0 - if self.contentparams['type'].endswith('/xml'): - return 0 - return 1 - - def _itsAnHrefDamnIt(self, attrsD): - href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None))) - if href: - try: - del attrsD['url'] - except KeyError: - pass - try: - del attrsD['uri'] - except KeyError: - pass - attrsD['href'] = href - return attrsD - - def _save(self, key, value): - context = self._getContext() - context.setdefault(key, value) - - def _start_rss(self, attrsD): - versionmap = {'0.91': 'rss091u', - '0.92': 'rss092', - '0.93': 'rss093', - '0.94': 'rss094'} - if not self.version: - attr_version = attrsD.get('version', '') - version = versionmap.get(attr_version) - if version: - self.version = version - elif attr_version.startswith('2.'): - self.version = 'rss20' - else: - self.version = 'rss' - - def _start_dlhottitles(self, attrsD): - self.version = 'hotrss' - - def _start_channel(self, attrsD): - self.infeed = 1 - self._cdf_common(attrsD) - _start_feedinfo = _start_channel - - def _cdf_common(self, attrsD): - if attrsD.has_key('lastmod'): - self._start_modified({}) - self.elementstack[-1][-1] = attrsD['lastmod'] - self._end_modified() - if attrsD.has_key('href'): - self._start_link({}) - self.elementstack[-1][-1] = attrsD['href'] - self._end_link() - - def _start_feed(self, attrsD): - self.infeed = 1 - versionmap = {'0.1': 'atom01', - '0.2': 'atom02', - '0.3': 'atom03'} - if not self.version: - attr_version = attrsD.get('version') - version = versionmap.get(attr_version) - if version: - self.version = version - else: - self.version = 'atom' - - def _end_channel(self): - self.infeed = 0 - _end_feed = _end_channel - - def _start_image(self, attrsD): - self.inimage = 1 - self.push('image', 0) - context = self._getContext() - context.setdefault('image', FeedParserDict()) - - def _end_image(self): - self.pop('image') - self.inimage = 0 - - def _start_textinput(self, attrsD): - self.intextinput = 1 - self.push('textinput', 0) - context = self._getContext() - context.setdefault('textinput', FeedParserDict()) - _start_textInput = _start_textinput - - def _end_textinput(self): - self.pop('textinput') - self.intextinput = 0 - _end_textInput = _end_textinput - - def _start_author(self, attrsD): - self.inauthor = 1 - self.push('author', 1) - _start_managingeditor = _start_author - _start_dc_author = _start_author - _start_dc_creator = _start_author - _start_itunes_author = _start_author - - def _end_author(self): - self.pop('author') - self.inauthor = 0 - self._sync_author_detail() - _end_managingeditor = _end_author - _end_dc_author = _end_author - _end_dc_creator = _end_author - _end_itunes_author = _end_author - - def _start_itunes_owner(self, attrsD): - self.inpublisher = 1 - self.push('publisher', 0) - - def _end_itunes_owner(self): - self.pop('publisher') - self.inpublisher = 0 - self._sync_author_detail('publisher') - - def _start_contributor(self, attrsD): - self.incontributor = 1 - context = 
self._getContext() - context.setdefault('contributors', []) - context['contributors'].append(FeedParserDict()) - self.push('contributor', 0) - - def _end_contributor(self): - self.pop('contributor') - self.incontributor = 0 - - def _start_dc_contributor(self, attrsD): - self.incontributor = 1 - context = self._getContext() - context.setdefault('contributors', []) - context['contributors'].append(FeedParserDict()) - self.push('name', 0) - - def _end_dc_contributor(self): - self._end_name() - self.incontributor = 0 - - def _start_name(self, attrsD): - self.push('name', 0) - _start_itunes_name = _start_name - - def _end_name(self): - value = self.pop('name') - if self.inpublisher: - self._save_author('name', value, 'publisher') - elif self.inauthor: - self._save_author('name', value) - elif self.incontributor: - self._save_contributor('name', value) - elif self.intextinput: - context = self._getContext() - context['textinput']['name'] = value - _end_itunes_name = _end_name - - def _start_width(self, attrsD): - self.push('width', 0) - - def _end_width(self): - value = self.pop('width') - try: - value = int(value) - except: - value = 0 - if self.inimage: - context = self._getContext() - context['image']['width'] = value - - def _start_height(self, attrsD): - self.push('height', 0) - - def _end_height(self): - value = self.pop('height') - try: - value = int(value) - except: - value = 0 - if self.inimage: - context = self._getContext() - context['image']['height'] = value - - def _start_url(self, attrsD): - self.push('href', 1) - _start_homepage = _start_url - _start_uri = _start_url - - def _end_url(self): - value = self.pop('href') - if self.inauthor: - self._save_author('href', value) - elif self.incontributor: - self._save_contributor('href', value) - elif self.inimage: - context = self._getContext() - context['image']['href'] = value - elif self.intextinput: - context = self._getContext() - context['textinput']['link'] = value - _end_homepage = _end_url - _end_uri = _end_url - - def _start_email(self, attrsD): - self.push('email', 0) - _start_itunes_email = _start_email - - def _end_email(self): - value = self.pop('email') - if self.inpublisher: - self._save_author('email', value, 'publisher') - elif self.inauthor: - self._save_author('email', value) - elif self.incontributor: - self._save_contributor('email', value) - _end_itunes_email = _end_email - - def _getContext(self): - if self.insource: - context = self.sourcedata - elif self.inentry: - context = self.entries[-1] - else: - context = self.feeddata - return context - - def _save_author(self, key, value, prefix='author'): - context = self._getContext() - context.setdefault(prefix + '_detail', FeedParserDict()) - context[prefix + '_detail'][key] = value - self._sync_author_detail() - - def _save_contributor(self, key, value): - context = self._getContext() - context.setdefault('contributors', [FeedParserDict()]) - context['contributors'][-1][key] = value - - def _sync_author_detail(self, key='author'): - context = self._getContext() - detail = context.get('%s_detail' % key) - if detail: - name = detail.get('name') - email = detail.get('email') - if name and email: - context[key] = '%s (%s)' % (name, email) - elif name: - context[key] = name - elif email: - context[key] = email - else: - author = context.get(key) - if not author: return - emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author) - if not emailmatch: return - email = 
emailmatch.group(0) - # probably a better way to do the following, but it passes all the tests - author = author.replace(email, '') - author = author.replace('()', '') - author = author.strip() - if author and (author[0] == '('): - author = author[1:] - if author and (author[-1] == ')'): - author = author[:-1] - author = author.strip() - context.setdefault('%s_detail' % key, FeedParserDict()) - context['%s_detail' % key]['name'] = author - context['%s_detail' % key]['email'] = email - - def _start_subtitle(self, attrsD): - self.pushContent('subtitle', attrsD, 'text/plain', 1) - _start_tagline = _start_subtitle - _start_itunes_subtitle = _start_subtitle - - def _end_subtitle(self): - self.popContent('subtitle') - _end_tagline = _end_subtitle - _end_itunes_subtitle = _end_subtitle - - def _start_rights(self, attrsD): - self.pushContent('rights', attrsD, 'text/plain', 1) - _start_dc_rights = _start_rights - _start_copyright = _start_rights - - def _end_rights(self): - self.popContent('rights') - _end_dc_rights = _end_rights - _end_copyright = _end_rights - - def _start_item(self, attrsD): - self.entries.append(FeedParserDict()) - self.push('item', 0) - self.inentry = 1 - self.guidislink = 0 - id = self._getAttribute(attrsD, 'rdf:about') - if id: - context = self._getContext() - context['id'] = id - self._cdf_common(attrsD) - _start_entry = _start_item - _start_product = _start_item - - def _end_item(self): - self.pop('item') - self.inentry = 0 - _end_entry = _end_item - - def _start_dc_language(self, attrsD): - self.push('language', 1) - _start_language = _start_dc_language - - def _end_dc_language(self): - self.lang = self.pop('language') - _end_language = _end_dc_language - - def _start_dc_publisher(self, attrsD): - self.push('publisher', 1) - _start_webmaster = _start_dc_publisher - - def _end_dc_publisher(self): - self.pop('publisher') - self._sync_author_detail('publisher') - _end_webmaster = _end_dc_publisher - - def _start_published(self, attrsD): - self.push('published', 1) - _start_dcterms_issued = _start_published - _start_issued = _start_published - - def _end_published(self): - value = self.pop('published') - self._save('published_parsed', _parse_date(value)) - _end_dcterms_issued = _end_published - _end_issued = _end_published - - def _start_updated(self, attrsD): - self.push('updated', 1) - _start_modified = _start_updated - _start_dcterms_modified = _start_updated - _start_pubdate = _start_updated - _start_dc_date = _start_updated - - def _end_updated(self): - value = self.pop('updated') - parsed_value = _parse_date(value) - self._save('updated_parsed', parsed_value) - _end_modified = _end_updated - _end_dcterms_modified = _end_updated - _end_pubdate = _end_updated - _end_dc_date = _end_updated - - def _start_created(self, attrsD): - self.push('created', 1) - _start_dcterms_created = _start_created - - def _end_created(self): - value = self.pop('created') - self._save('created_parsed', _parse_date(value)) - _end_dcterms_created = _end_created - - def _start_expirationdate(self, attrsD): - self.push('expired', 1) - - def _end_expirationdate(self): - self._save('expired_parsed', _parse_date(self.pop('expired'))) - - def _start_cc_license(self, attrsD): - self.push('license', 1) - value = self._getAttribute(attrsD, 'rdf:resource') - if value: - self.elementstack[-1][2].append(value) - self.pop('license') - - def _start_creativecommons_license(self, attrsD): - self.push('license', 1) - - def _end_creativecommons_license(self): - self.pop('license') - - def _addTag(self, term, 
scheme, label): - context = self._getContext() - tags = context.setdefault('tags', []) - if (not term) and (not scheme) and (not label): return - value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label}) - if value not in tags: - tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label})) - - def _start_category(self, attrsD): - if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD)) - term = attrsD.get('term') - scheme = attrsD.get('scheme', attrsD.get('domain')) - label = attrsD.get('label') - self._addTag(term, scheme, label) - self.push('category', 1) - _start_dc_subject = _start_category - _start_keywords = _start_category - - def _end_itunes_keywords(self): - for term in self.pop('itunes_keywords').split(): - self._addTag(term, 'http://www.itunes.com/', None) - - def _start_itunes_category(self, attrsD): - self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None) - self.push('category', 1) - - def _end_category(self): - value = self.pop('category') - if not value: return - context = self._getContext() - tags = context['tags'] - if value and len(tags) and not tags[-1]['term']: - tags[-1]['term'] = value - else: - self._addTag(value, None, None) - _end_dc_subject = _end_category - _end_keywords = _end_category - _end_itunes_category = _end_category - - def _start_cloud(self, attrsD): - self._getContext()['cloud'] = FeedParserDict(attrsD) - - def _start_link(self, attrsD): - attrsD.setdefault('rel', 'alternate') - attrsD.setdefault('type', 'text/html') - attrsD = self._itsAnHrefDamnIt(attrsD) - if attrsD.has_key('href'): - attrsD['href'] = self.resolveURI(attrsD['href']) - expectingText = self.infeed or self.inentry or self.insource - context = self._getContext() - context.setdefault('links', []) - context['links'].append(FeedParserDict(attrsD)) - if attrsD['rel'] == 'enclosure': - self._start_enclosure(attrsD) - if attrsD.has_key('href'): - expectingText = 0 - if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): - context['link'] = attrsD['href'] - else: - self.push('link', expectingText) - _start_producturl = _start_link - - def _end_link(self): - value = self.pop('link') - context = self._getContext() - if self.intextinput: - context['textinput']['link'] = value - if self.inimage: - context['image']['link'] = value - _end_producturl = _end_link - - def _start_guid(self, attrsD): - self.guidislink = (attrsD.get('ispermalink', 'true') == 'true') - self.push('id', 1) - - def _end_guid(self): - value = self.pop('id') - self._save('guidislink', self.guidislink and not self._getContext().has_key('link')) - if self.guidislink: - # guid acts as link, but only if 'ispermalink' is not present or is 'true', - # and only if the item doesn't already have a link element - self._save('link', value) - - def _start_title(self, attrsD): - self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) - def _start_title_low_pri(self, attrsD): - if not self._getContext().has_key('title'): - self._start_title(attrsD) - _start_dc_title = _start_title_low_pri - _start_media_title = _start_title_low_pri - - def _end_title(self): - value = self.popContent('title') - context = self._getContext() - if self.intextinput: - context['textinput']['title'] = value - elif self.inimage: - context['image']['title'] = value - def _end_title_low_pri(self): - if not self._getContext().has_key('title'): - self._end_title() - _end_dc_title = _end_title_low_pri - _end_media_title 
= _end_title_low_pri - - def _start_description(self, attrsD): - context = self._getContext() - if context.has_key('summary'): - self._summaryKey = 'content' - self._start_content(attrsD) - else: - self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource) - - def _start_abstract(self, attrsD): - self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) - - def _end_description(self): - if self._summaryKey == 'content': - self._end_content() - else: - value = self.popContent('description') - context = self._getContext() - if self.intextinput: - context['textinput']['description'] = value - elif self.inimage: - context['image']['description'] = value - self._summaryKey = None - _end_abstract = _end_description - - def _start_info(self, attrsD): - self.pushContent('info', attrsD, 'text/plain', 1) - _start_feedburner_browserfriendly = _start_info - - def _end_info(self): - self.popContent('info') - _end_feedburner_browserfriendly = _end_info - - def _start_generator(self, attrsD): - if attrsD: - attrsD = self._itsAnHrefDamnIt(attrsD) - if attrsD.has_key('href'): - attrsD['href'] = self.resolveURI(attrsD['href']) - self._getContext()['generator_detail'] = FeedParserDict(attrsD) - self.push('generator', 1) - - def _end_generator(self): - value = self.pop('generator') - context = self._getContext() - if context.has_key('generator_detail'): - context['generator_detail']['name'] = value - - def _start_admin_generatoragent(self, attrsD): - self.push('generator', 1) - value = self._getAttribute(attrsD, 'rdf:resource') - if value: - self.elementstack[-1][2].append(value) - self.pop('generator') - self._getContext()['generator_detail'] = FeedParserDict({'href': value}) - - def _start_admin_errorreportsto(self, attrsD): - self.push('errorreportsto', 1) - value = self._getAttribute(attrsD, 'rdf:resource') - if value: - self.elementstack[-1][2].append(value) - self.pop('errorreportsto') - - def _start_summary(self, attrsD): - context = self._getContext() - if context.has_key('summary'): - self._summaryKey = 'content' - self._start_content(attrsD) - else: - self._summaryKey = 'summary' - self.pushContent(self._summaryKey, attrsD, 'text/plain', 1) - _start_itunes_summary = _start_summary - - def _end_summary(self): - if self._summaryKey == 'content': - self._end_content() - else: - self.popContent(self._summaryKey or 'summary') - self._summaryKey = None - _end_itunes_summary = _end_summary - - def _start_enclosure(self, attrsD): - attrsD = self._itsAnHrefDamnIt(attrsD) - self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD)) - href = attrsD.get('href') - if href: - context = self._getContext() - if not context.get('id'): - context['id'] = href - - def _start_source(self, attrsD): - self.insource = 1 - - def _end_source(self): - self.insource = 0 - self._getContext()['source'] = copy.deepcopy(self.sourcedata) - self.sourcedata.clear() - - def _start_content(self, attrsD): - self.pushContent('content', attrsD, 'text/plain', 1) - src = attrsD.get('src') - if src: - self.contentparams['src'] = src - self.push('content', 1) - - def _start_prodlink(self, attrsD): - self.pushContent('content', attrsD, 'text/html', 1) - - def _start_body(self, attrsD): - self.pushContent('content', attrsD, 'application/xhtml+xml', 1) - _start_xhtml_body = _start_body - - def _start_content_encoded(self, attrsD): - self.pushContent('content', attrsD, 'text/html', 1) - _start_fullitem = _start_content_encoded - - def 
_end_content(self): - copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types) - value = self.popContent('content') - if copyToDescription: - self._save('description', value) - _end_body = _end_content - _end_xhtml_body = _end_content - _end_content_encoded = _end_content - _end_fullitem = _end_content - _end_prodlink = _end_content - - def _start_itunes_image(self, attrsD): - self.push('itunes_image', 0) - self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')}) - _start_itunes_link = _start_itunes_image - - def _end_itunes_block(self): - value = self.pop('itunes_block', 0) - self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0 - - def _end_itunes_explicit(self): - value = self.pop('itunes_explicit', 0) - self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0 - -if _XML_AVAILABLE: - class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): - def __init__(self, baseuri, baselang, encoding): - if _debug: sys.stderr.write('trying StrictFeedParser\n') - xml.sax.handler.ContentHandler.__init__(self) - _FeedParserMixin.__init__(self, baseuri, baselang, encoding) - self.bozo = 0 - self.exc = None - - def startPrefixMapping(self, prefix, uri): - self.trackNamespace(prefix, uri) - - def startElementNS(self, name, qname, attrs): - namespace, localname = name - lowernamespace = str(namespace or '').lower() - if lowernamespace.find('backend.userland.com/rss') <> -1: - # match any backend.userland.com namespace - namespace = 'http://backend.userland.com/rss' - lowernamespace = namespace - if qname and qname.find(':') > 0: - givenprefix = qname.split(':')[0] - else: - givenprefix = None - prefix = self._matchnamespaces.get(lowernamespace, givenprefix) - if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix): - raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix - if prefix: - localname = prefix + ':' + localname - localname = str(localname).lower() - if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname)) - - # qname implementation is horribly broken in Python 2.1 (it - # doesn't report any), and slightly broken in Python 2.2 (it - # doesn't report the xml: namespace). So we match up namespaces - # with a known list first, and then possibly override them with - # the qnames the SAX parser gives us (if indeed it gives us any - # at all). Thanks to MatejC for helping me test this and - # tirelessly telling me that it didn't work yet. 
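The namespace bookkeeping above works around the qname reporting bugs in Python 2.1 and 2.2 that the comment describes. On Python 3, namespace-aware SAX callbacks are a documented feature switch; a minimal sketch (the surrounding handler wiring is an illustrative assumption):

    import xml.sax
    from xml.sax.handler import feature_namespaces

    parser = xml.sax.make_parser()
    # Deliver startElementNS/endElementNS with (namespace, localname) tuples
    parser.setFeature(feature_namespaces, True)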
- attrsD = {} - for (namespace, attrlocalname), attrvalue in attrs._attrs.items(): - lowernamespace = (namespace or '').lower() - prefix = self._matchnamespaces.get(lowernamespace, '') - if prefix: - attrlocalname = prefix + ':' + attrlocalname - attrsD[str(attrlocalname).lower()] = attrvalue - for qname in attrs.getQNames(): - attrsD[str(qname).lower()] = attrs.getValueByQName(qname) - self.unknown_starttag(localname, attrsD.items()) - - def characters(self, text): - self.handle_data(text) - - def endElementNS(self, name, qname): - namespace, localname = name - lowernamespace = str(namespace or '').lower() - if qname and qname.find(':') > 0: - givenprefix = qname.split(':')[0] - else: - givenprefix = '' - prefix = self._matchnamespaces.get(lowernamespace, givenprefix) - if prefix: - localname = prefix + ':' + localname - localname = str(localname).lower() - self.unknown_endtag(localname) - - def error(self, exc): - self.bozo = 1 - self.exc = exc - - def fatalError(self, exc): - self.error(exc) - raise exc - -class _BaseHTMLProcessor(sgmllib.SGMLParser): - elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', - 'img', 'input', 'isindex', 'link', 'meta', 'param'] - - def __init__(self, encoding): - self.encoding = encoding - if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding) - sgmllib.SGMLParser.__init__(self) - - def reset(self): - self.pieces = [] - sgmllib.SGMLParser.reset(self) - - def _shorttag_replace(self, match): - tag = match.group(1) - if tag in self.elements_no_end_tag: - return '<' + tag + ' />' - else: - return '<' + tag + '>' - - def feed(self, data): - data = re.compile(r'', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace - data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data) - data = data.replace(''', "'") - data = data.replace('"', '"') - if self.encoding and type(data) == type(u''): - data = data.encode(self.encoding) - sgmllib.SGMLParser.feed(self, data) - sgmllib.SGMLParser.close(self) - - def normalize_attrs(self, attrs): - # utility method to be called by descendants - attrs = [(k.lower(), v) for k, v in attrs] - attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] - return attrs - - def unknown_starttag(self, tag, attrs): - # called for each start tag - # attrs is a list of (attr, value) tuples - # e.g. for
<pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
-        uattrs = []
-        # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
-        for key, value in attrs:
-            if type(value) != type(u''):
-                value = unicode(value, self.encoding)
-            uattrs.append((unicode(key, self.encoding), value))
-        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
-        if tag in self.elements_no_end_tag:
-            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
-        else:
-            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
-
-    def unknown_endtag(self, tag):
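The unicode round-trip in unknown_starttag() above is a Python 2 artifact: sgmllib hands back byte strings, so keys and values must be decoded before reassembly. On Python 3 every string is already text, and reconstructing a start tag reduces to a join; a rough equivalent (helper name and void-element list are illustrative):

    def reconstruct_starttag(tag, attrs, void_elements=("br", "hr", "img")):
        # attrs is a list of (name, value) pairs, as the SGML parser supplies
        strattrs = "".join(f' {k}="{v}"' for k, v in attrs)
        suffix = " />" if tag in void_elements else ">"
        return f"<{tag}{strattrs}{suffix}"

    assert reconstruct_starttag("pre", [("class", "screen")]) == '<pre class="screen">'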
-        # called for each end tag, e.g. for 
, tag will be 'pre' - # Reconstruct the original end tag. - if tag not in self.elements_no_end_tag: - self.pieces.append("" % locals()) - - def handle_charref(self, ref): - # called for each character reference, e.g. for ' ', ref will be '160' - # Reconstruct the original character reference. - self.pieces.append('&#%(ref)s;' % locals()) - - def handle_entityref(self, ref): - # called for each entity reference, e.g. for '©', ref will be 'copy' - # Reconstruct the original entity reference. - import htmlentitydefs - if not hasattr(htmlentitydefs, 'name2codepoint') or htmlentitydefs.name2codepoint.has_key(ref): - self.pieces.append('&%(ref)s;' % locals()) - else: - self.pieces.append('&%(ref)s' % locals()) - - def handle_data(self, text): - # called for each block of plain text, i.e. outside of any tag and - # not containing any character or entity references - # Store the original text verbatim. - if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text) - self.pieces.append(text) - - def handle_comment(self, text): - # called for each HTML comment, e.g. - # Reconstruct the original comment. - self.pieces.append('' % locals()) - - def handle_pi(self, text): - # called for each processing instruction, e.g. - # Reconstruct original processing instruction. - self.pieces.append('' % locals()) - - def handle_decl(self, text): - # called for the DOCTYPE, if present, e.g. - # - # Reconstruct original DOCTYPE - self.pieces.append('' % locals()) - - _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match - def _scan_name(self, i, declstartpos): - rawdata = self.rawdata - n = len(rawdata) - if i == n: - return None, -1 - m = self._new_declname_match(rawdata, i) - if m: - s = m.group() - name = s.strip() - if (i + len(s)) == n: - return None, -1 # end of buffer - return name.lower(), m.end() - else: - self.handle_data(rawdata) -# self.updatepos(declstartpos, i) - return None, -1 - - def output(self): - '''Return processed HTML as a single string''' - return ''.join([str(p) for p in self.pieces]) - -class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor): - def __init__(self, baseuri, baselang, encoding): - sgmllib.SGMLParser.__init__(self) - _FeedParserMixin.__init__(self, baseuri, baselang, encoding) - - def decodeEntities(self, element, data): - data = data.replace('<', '<') - data = data.replace('<', '<') - data = data.replace('<', '<') - data = data.replace('>', '>') - data = data.replace('>', '>') - data = data.replace('>', '>') - data = data.replace('&', '&') - data = data.replace('&', '&') - data = data.replace('"', '"') - data = data.replace('"', '"') - data = data.replace(''', ''') - data = data.replace(''', ''') - if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): - data = data.replace('<', '<') - data = data.replace('>', '>') - data = data.replace('&', '&') - data = data.replace('"', '"') - data = data.replace(''', "'") - return data - - def strattrs(self, attrs): - return ''.join([' %s="%s"' % t for t in attrs]) - -class _RelativeURIResolver(_BaseHTMLProcessor): - relative_uris = [('a', 'href'), - ('applet', 'codebase'), - ('area', 'href'), - ('blockquote', 'cite'), - ('body', 'background'), - ('del', 'cite'), - ('form', 'action'), - ('frame', 'longdesc'), - ('frame', 'src'), - ('iframe', 'longdesc'), - ('iframe', 'src'), - ('head', 'profile'), - ('img', 'longdesc'), - ('img', 'src'), - ('img', 'usemap'), - ('input', 'src'), - ('input', 'usemap'), - ('ins', 'cite'), - ('link', 'href'), - ('object', 
'classid'), - ('object', 'codebase'), - ('object', 'data'), - ('object', 'usemap'), - ('q', 'cite'), - ('script', 'src')] - - def __init__(self, baseuri, encoding): - _BaseHTMLProcessor.__init__(self, encoding) - self.baseuri = baseuri - - def resolveURI(self, uri): - return _urljoin(self.baseuri, uri) - - def unknown_starttag(self, tag, attrs): - attrs = self.normalize_attrs(attrs) - attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs] - _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) - -def _resolveRelativeURIs(htmlSource, baseURI, encoding): - if _debug: sys.stderr.write('entering _resolveRelativeURIs\n') - p = _RelativeURIResolver(baseURI, encoding) - p.feed(htmlSource) - return p.output() - -class _HTMLSanitizer(_BaseHTMLProcessor): - acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big', - 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col', - 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset', - 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', - 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup', - 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike', - 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', - 'thead', 'tr', 'tt', 'u', 'ul', 'var'] - - acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', - 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', - 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols', - 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', - 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', - 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method', - 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', - 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', - 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type', - 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'] - - unacceptable_elements_with_end_tag = ['script', 'applet'] - - def reset(self): - _BaseHTMLProcessor.reset(self) - self.unacceptablestack = 0 - - def unknown_starttag(self, tag, attrs): - if not tag in self.acceptable_elements: - if tag in self.unacceptable_elements_with_end_tag: - self.unacceptablestack += 1 - return - attrs = self.normalize_attrs(attrs) - attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes] - _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) - - def unknown_endtag(self, tag): - if not tag in self.acceptable_elements: - if tag in self.unacceptable_elements_with_end_tag: - self.unacceptablestack -= 1 - return - _BaseHTMLProcessor.unknown_endtag(self, tag) - - def handle_pi(self, text): - pass - - def handle_decl(self, text): - pass - - def handle_data(self, text): - if not self.unacceptablestack: - _BaseHTMLProcessor.handle_data(self, text) - -def _sanitizeHTML(htmlSource, encoding): - p = _HTMLSanitizer(encoding) - p.feed(htmlSource) - data = p.output() - if TIDY_MARKUP: - # loop through list of preferred Tidy interfaces looking for one that's installed, - # then set up a common _tidy function to wrap the interface-specific API. 
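Note: _HTMLSanitizer above keeps only allowlisted tags and attributes and suppresses the content of script/applet blocks entirely. The sgmllib module all of these processors subclass was removed in Python 3; a minimal sketch of the same allowlist idea on top of the stdlib html.parser follows (the shortened ACCEPTABLE_* sets and the Sanitizer/sanitize names are illustrative, not the full lists from the module above). The Tidy lookup described in the comment above continues below.

    from html.parser import HTMLParser

    ACCEPTABLE_ELEMENTS = {'a', 'b', 'em', 'i', 'li', 'ol', 'p', 'pre', 'strong', 'ul'}
    ACCEPTABLE_ATTRIBUTES = {'href', 'title'}

    class Sanitizer(HTMLParser):
        def __init__(self):
            super().__init__()
            self.pieces = []
            self.suppress = 0  # > 0 while inside <script>/<applet>

        def handle_starttag(self, tag, attrs):
            if tag not in ACCEPTABLE_ELEMENTS:
                if tag in ('script', 'applet'):
                    self.suppress += 1
                return
            kept = [(k, v) for k, v in attrs if k in ACCEPTABLE_ATTRIBUTES and v is not None]
            self.pieces.append('<%s%s>' % (tag, ''.join(' %s="%s"' % kv for kv in kept)))

        def handle_endtag(self, tag):
            if tag not in ACCEPTABLE_ELEMENTS:
                if tag in ('script', 'applet') and self.suppress:
                    self.suppress -= 1
                return
            self.pieces.append('</%s>' % tag)

        def handle_data(self, data):
            if not self.suppress:
                self.pieces.append(data)

    def sanitize(html):
        p = Sanitizer()
        p.feed(html)
        return ''.join(p.pieces)

    # sanitize('<p onclick="x()">hi<script>evil()</script></p>') -> '<p>hi</p>'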
-        _tidy = None
-        for tidy_interface in PREFERRED_TIDY_INTERFACES:
-            try:
-                if tidy_interface == "uTidy":
-                    from tidy import parseString as _utidy
-                    def _tidy(data, **kwargs):
-                        return str(_utidy(data, **kwargs))
-                    break
-                elif tidy_interface == "mxTidy":
-                    from mx.Tidy import Tidy as _mxtidy
-                    def _tidy(data, **kwargs):
-                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
-                        return data
-                    break
-            except:
-                pass
-        if _tidy:
-            utf8 = type(data) == type(u'')
-            if utf8:
-                data = data.encode('utf-8')
-            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
-            if utf8:
-                data = unicode(data, 'utf-8')
-            if data.count('<body'):
-                data = data.split('<body', 1)[1]
-                if data.count('>'):
-                    data = data.split('>', 1)[1]
-            if data.count('</body'):
-                data = data.split('</body', 1)[0]
-    data = data.strip().replace('\r\n', '\n')
-    return data
-
-class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
-    def http_error_401(self, req, fp, code, msg, headers):
-        host = urlparse.urlparse(req.get_full_url())[1]
-        try:
-            assert sys.version.split()[0] >= '2.3.3'
-            assert base64 != None
-            user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
-            realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
-            self.add_password(realm, host, user, passw)
-            retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
-            self.reset_retry_count()
-            return retry
-        except:
-            return self.http_error_default(req, fp, code, msg, headers)
-
-def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
-    """URL, filename, or string --> stream
-
-    This function lets you define parsers that take any input source
-    (URL, pathname to local or network file, or actual data as a string)
-    and deal with it in a uniform manner. Returned object is guaranteed
-    to have all the basic stdio read methods (read, readline, readlines).
-    Just .close() the object when you're done with it.
-
-    If the etag argument is supplied, it will be used as the value of an
-    If-None-Match request header.
-
-    If the modified argument is supplied, it must be a tuple of 9 integers
-    as returned by gmtime() in the standard Python time module. This MUST
-    be in GMT (Greenwich Mean Time). The formatted date/time will be used
-    as the value of an If-Modified-Since request header.
-
-    If the agent argument is supplied, it will be used as the value of a
-    User-Agent request header.
-
-    If the referrer argument is supplied, it will be used as the value of a
-    Referer[sic] request header.
-
-    If handlers is supplied, it is a list of handlers used to build a
-    urllib2 opener.
-    """
-
-    if hasattr(url_file_stream_or_string, 'read'):
-        return url_file_stream_or_string
-
-    if url_file_stream_or_string == '-':
-        return sys.stdin
-
-    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
-        if not agent:
-            agent = USER_AGENT
-        # test for inline user:password for basic auth
-        auth = None
-        if base64:
-            urltype, rest = urllib.splittype(url_file_stream_or_string)
-            realhost, rest = urllib.splithost(rest)
-            if realhost:
-                user_passwd, realhost = urllib.splituser(realhost)
-                if user_passwd:
-                    url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
-                    auth = base64.encodestring(user_passwd).strip()
-        # try to open with urllib2 (to use optional headers)
-        request = urllib2.Request(url_file_stream_or_string)
-        request.add_header('User-Agent', agent)
-        if etag:
-            request.add_header('If-None-Match', etag)
-        if modified:
-            # format into an RFC 1123-compliant timestamp. We can't use
-            # time.strftime() since the %a and %b directives can be affected
-            # by the current locale, but RFC 2616 states that dates must be
-            # in English.
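Note: the comment above is why the original builds its own weekday/month tables (they follow below): strftime's %a and %b honor the current locale, while HTTP dates must use English names. In Python 3 the stdlib produces the same RFC 1123 header directly; a sketch, assuming `modified` is the UTC 9-tuple this module passes around:

    import calendar
    from email.utils import formatdate

    def if_modified_since(modified):
        # modified: a 9-tuple/struct_time in GMT, as used throughout feedparser
        return formatdate(calendar.timegm(modified), usegmt=True)

    # if_modified_since(time.gmtime(0)) -> 'Thu, 01 Jan 1970 00:00:00 GMT'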
-            short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
-            months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
-            request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
-        if referrer:
-            request.add_header('Referer', referrer)
-        if gzip and zlib:
-            request.add_header('Accept-encoding', 'gzip, deflate')
-        elif gzip:
-            request.add_header('Accept-encoding', 'gzip')
-        elif zlib:
-            request.add_header('Accept-encoding', 'deflate')
-        else:
-            request.add_header('Accept-encoding', '')
-        if auth:
-            request.add_header('Authorization', 'Basic %s' % auth)
-        if ACCEPT_HEADER:
-            request.add_header('Accept', ACCEPT_HEADER)
-        request.add_header('A-IM', 'feed') # RFC 3229 support
-        opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
-        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
-        try:
-            return opener.open(request)
-        finally:
-            opener.close() # JohnD
-
-    # try to open with native open function (if url_file_stream_or_string is a filename)
-    try:
-        return open(url_file_stream_or_string)
-    except:
-        pass
-
-    # treat url_file_stream_or_string as string
-    return _StringIO(str(url_file_stream_or_string))
-
-_date_handlers = []
-def registerDateHandler(func):
-    '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
-    _date_handlers.insert(0, func)
-
-# ISO-8601 date parsing routines written by Fazal Majid.
-# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
-# parser is beyond the scope of feedparser and would be a worthwhile addition
-# to the Python library.
-# A single regular expression cannot parse ISO 8601 date formats into groups
-# as the standard is highly irregular (for instance is 030104 2003-01-04 or
-# 0301-04-01), so we use templates instead.
-# Please note the order in templates is significant because we need a
-# greedy match.
-_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
-                'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
-                '-YY-?MM', '-OOO', '-YY',
-                '--MM-?DD', '--MM',
-                '---DD',
-                'CC', '']
-_iso8601_re = [
-    tmpl.replace(
-    'YYYY', r'(?P<year>\d{4})').replace(
-    'YY', r'(?P<year>\d\d)').replace(
-    'MM', r'(?P<month>[01]\d)').replace(
-    'DD', r'(?P<day>[0123]\d)').replace(
-    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
-    'CC', r'(?P<century>\d\d$)')
-    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
-    + r'(:(?P<second>\d{2}))?'
-    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
-    for tmpl in _iso8601_tmpl]
-del tmpl
-_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
-del regex
-def _parse_date_iso8601(dateString):
-    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
-    m = None
-    for _iso8601_match in _iso8601_matches:
-        m = _iso8601_match(dateString)
-        if m: break
-    if not m: return
-    if m.span() == (0, 0): return
-    params = m.groupdict()
-    ordinal = params.get('ordinal', 0)
-    if ordinal:
-        ordinal = int(ordinal)
-    else:
-        ordinal = 0
-    year = params.get('year', '--')
-    if not year or year == '--':
-        year = time.gmtime()[0]
-    elif len(year) == 2:
-        # ISO 8601 assumes current century, i.e.
93 -> 2093, NOT 1993 - year = 100 * int(time.gmtime()[0] / 100) + int(year) - else: - year = int(year) - month = params.get('month', '-') - if not month or month == '-': - # ordinals are NOT normalized by mktime, we simulate them - # by setting month=1, day=ordinal - if ordinal: - month = 1 - else: - month = time.gmtime()[1] - month = int(month) - day = params.get('day', 0) - if not day: - # see above - if ordinal: - day = ordinal - elif params.get('century', 0) or \ - params.get('year', 0) or params.get('month', 0): - day = 1 - else: - day = time.gmtime()[2] - else: - day = int(day) - # special case of the century - is the first year of the 21st century - # 2000 or 2001 ? The debate goes on... - if 'century' in params.keys(): - year = (int(params['century']) - 1) * 100 + 1 - # in ISO 8601 most fields are optional - for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']: - if not params.get(field, None): - params[field] = 0 - hour = int(params.get('hour', 0)) - minute = int(params.get('minute', 0)) - second = int(params.get('second', 0)) - # weekday is normalized by mktime(), we can ignore it - weekday = 0 - # daylight savings is complex, but not needed for feedparser's purposes - # as time zones, if specified, include mention of whether it is active - # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and - # and most implementations have DST bugs - daylight_savings_flag = 0 - tm = [year, month, day, hour, minute, second, weekday, - ordinal, daylight_savings_flag] - # ISO 8601 time zone adjustments - tz = params.get('tz') - if tz and tz != 'Z': - if tz[0] == '-': - tm[3] += int(params.get('tzhour', 0)) - tm[4] += int(params.get('tzmin', 0)) - elif tz[0] == '+': - tm[3] -= int(params.get('tzhour', 0)) - tm[4] -= int(params.get('tzmin', 0)) - else: - return None - # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) - # which is guaranteed to normalize d/m/y/h/m/s. - # Many implementations have bugs, but we'll pretend they don't. - return time.localtime(time.mktime(tm)) -registerDateHandler(_parse_date_iso8601) - -# 8-bit date handling routines written by ytrewq1. 
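Note: on Python 3.11+ the template/regex machinery above is largely covered by the stdlib, since datetime.fromisoformat accepts most ISO 8601 variants, including basic forms like 20040105 and a trailing 'Z'. A sketch returning the same kind of UTC 9-tuple (parse_iso8601 is an illustrative name; the 8-bit date handlers continue below):

    from datetime import datetime, timezone

    def parse_iso8601(value):
        dt = datetime.fromisoformat(value)        # Python 3.11+ for 'Z'/basic formats
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)  # treat naive stamps as UTC
        return dt.astimezone(timezone.utc).timetuple()

    # parse_iso8601('2004-01-05T12:30:00Z')[:5] -> (2004, 1, 5, 12, 30)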
-_korean_year = u'\ub144' # b3e2 in euc-kr -_korean_month = u'\uc6d4' # bff9 in euc-kr -_korean_day = u'\uc77c' # c0cf in euc-kr -_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr -_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr - -_korean_onblog_date_re = \ - re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \ - (_korean_year, _korean_month, _korean_day)) -_korean_nate_date_re = \ - re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \ - (_korean_am, _korean_pm)) -def _parse_date_onblog(dateString): - '''Parse a string according to the OnBlog 8-bit date format''' - m = _korean_onblog_date_re.match(dateString) - if not m: return - w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ - {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ - 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ - 'zonediff': '+09:00'} - if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate) - return _parse_date_w3dtf(w3dtfdate) -registerDateHandler(_parse_date_onblog) - -def _parse_date_nate(dateString): - '''Parse a string according to the Nate 8-bit date format''' - m = _korean_nate_date_re.match(dateString) - if not m: return - hour = int(m.group(5)) - ampm = m.group(4) - if (ampm == _korean_pm): - hour += 12 - hour = str(hour) - if len(hour) == 1: - hour = '0' + hour - w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ - {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ - 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\ - 'zonediff': '+09:00'} - if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate) - return _parse_date_w3dtf(w3dtfdate) -registerDateHandler(_parse_date_nate) - -_mssql_date_re = \ - re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?') -def _parse_date_mssql(dateString): - '''Parse a string according to the MS SQL date format''' - m = _mssql_date_re.match(dateString) - if not m: return - w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ - {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ - 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ - 'zonediff': '+09:00'} - if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate) - return _parse_date_w3dtf(w3dtfdate) -registerDateHandler(_parse_date_mssql) - -# Unicode strings for Greek date strings -_greek_months = \ - { \ - u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7 - u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7 - u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7 - u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7 - u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7 - u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7 - u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7 - u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7 - u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7 - u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7 - u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7 - u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7 - u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7 - u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7 - u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7 - u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7 - u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7 - u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7 - u'\u0394\u03b5\u03ba': 
u'Dec', # c4e5ea in iso-8859-7 - } - -_greek_wdays = \ - { \ - u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7 - u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7 - u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7 - u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7 - u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7 - u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7 - u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7 - } - -_greek_date_format_re = \ - re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)') - -def _parse_date_greek(dateString): - '''Parse a string according to a Greek 8-bit date format.''' - m = _greek_date_format_re.match(dateString) - if not m: return - try: - wday = _greek_wdays[m.group(1)] - month = _greek_months[m.group(3)] - except: - return - rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \ - {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\ - 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\ - 'zonediff': m.group(8)} - if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date) - return _parse_date_rfc822(rfc822date) -registerDateHandler(_parse_date_greek) - -# Unicode strings for Hungarian date strings -_hungarian_months = \ - { \ - u'janu\u00e1r': u'01', # e1 in iso-8859-2 - u'febru\u00e1ri': u'02', # e1 in iso-8859-2 - u'm\u00e1rcius': u'03', # e1 in iso-8859-2 - u'\u00e1prilis': u'04', # e1 in iso-8859-2 - u'm\u00e1ujus': u'05', # e1 in iso-8859-2 - u'j\u00fanius': u'06', # fa in iso-8859-2 - u'j\u00falius': u'07', # fa in iso-8859-2 - u'augusztus': u'08', - u'szeptember': u'09', - u'okt\u00f3ber': u'10', # f3 in iso-8859-2 - u'november': u'11', - u'december': u'12', - } - -_hungarian_date_format_re = \ - re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))') - -def _parse_date_hungarian(dateString): - '''Parse a string according to a Hungarian 8-bit date format.''' - m = _hungarian_date_format_re.match(dateString) - if not m: return - try: - month = _hungarian_months[m.group(2)] - day = m.group(3) - if len(day) == 1: - day = '0' + day - hour = m.group(4) - if len(hour) == 1: - hour = '0' + hour - except: - return - w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \ - {'year': m.group(1), 'month': month, 'day': day,\ - 'hour': hour, 'minute': m.group(5),\ - 'zonediff': m.group(6)} - if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate) - return _parse_date_w3dtf(w3dtfdate) -registerDateHandler(_parse_date_hungarian) - -# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by -# Drake and licensed under the Python license. 
Removed all range checking
-# for month, day, hour, minute, and second, since mktime will normalize
-# these later
-def _parse_date_w3dtf(dateString):
-    def __extract_date(m):
-        year = int(m.group('year'))
-        if year < 100:
-            year = 100 * int(time.gmtime()[0] / 100) + int(year)
-        if year < 1000:
-            return 0, 0, 0
-        julian = m.group('julian')
-        if julian:
-            julian = int(julian)
-            month = julian / 30 + 1
-            day = julian % 30 + 1
-            jday = None
-            while jday != julian:
-                t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
-                jday = time.gmtime(t)[-2]
-                diff = abs(jday - julian)
-                if jday > julian:
-                    if diff < day:
-                        day = day - diff
-                    else:
-                        month = month - 1
-                        day = 31
-                elif jday < julian:
-                    if day + diff < 28:
-                        day = day + diff
-                    else:
-                        month = month + 1
-            return year, month, day
-        month = m.group('month')
-        day = 1
-        if month is None:
-            month = 1
-        else:
-            month = int(month)
-            day = m.group('day')
-            if day:
-                day = int(day)
-            else:
-                day = 1
-        return year, month, day
-
-    def __extract_time(m):
-        if not m:
-            return 0, 0, 0
-        hours = m.group('hours')
-        if not hours:
-            return 0, 0, 0
-        hours = int(hours)
-        minutes = int(m.group('minutes'))
-        seconds = m.group('seconds')
-        if seconds:
-            seconds = int(seconds)
-        else:
-            seconds = 0
-        return hours, minutes, seconds
-
-    def __extract_tzd(m):
-        '''Return the Time Zone Designator as an offset in seconds from UTC.'''
-        if not m:
-            return 0
-        tzd = m.group('tzd')
-        if not tzd:
-            return 0
-        if tzd == 'Z':
-            return 0
-        hours = int(m.group('tzdhours'))
-        minutes = m.group('tzdminutes')
-        if minutes:
-            minutes = int(minutes)
-        else:
-            minutes = 0
-        offset = (hours*60 + minutes) * 60
-        if tzd[0] == '+':
-            return -offset
-        return offset
-
-    __date_re = ('(?P<year>\d\d\d\d)'
-                 '(?:(?P<dsep>-|)'
-                 '(?:(?P<julian>\d\d\d)'
-                 '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
-    __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
-    __tzd_rx = re.compile(__tzd_re)
-    __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
-                 '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
-                 + __tzd_re)
-    __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
-    __datetime_rx = re.compile(__datetime_re)
-    m = __datetime_rx.match(dateString)
-    if (m is None) or (m.group() != dateString): return
-    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
-    if gmt[0] == 0: return
-    return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
-registerDateHandler(_parse_date_w3dtf)
-
-def _parse_date_rfc822(dateString):
-    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
-    data = dateString.split()
-    if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
-        del data[0]
-    if len(data) == 4:
-        s = data[3]
-        i = s.find('+')
-        if i > 0:
-            data[3:] = [s[:i], s[i+1:]]
-        else:
-            data.append('')
-        dateString = " ".join(data)
-    if len(data) < 5:
-        dateString += ' 00:00:00 GMT'
-    tm = rfc822.parsedate_tz(dateString)
-    if tm:
-        return time.gmtime(rfc822.mktime_tz(tm))
-# rfc822.py defines several time zones, but we define some extra ones.
-# 'ET' is equivalent to 'EST', etc.
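Note: the rfc822 module used above no longer exists in Python 3; its parser lives on in email.utils. A minimal sketch of the equivalent of _parse_date_rfc822 (it does not reproduce the extra-timezone table that follows below):

    import time
    from email.utils import mktime_tz, parsedate_tz

    def parse_rfc822(value):
        tm = parsedate_tz(value)           # tolerant RFC 822/1123/2822 parsing
        if tm is None:
            return None
        return time.gmtime(mktime_tz(tm))  # normalize to a 9-tuple in GMT

    # parse_rfc822('Sun, 06 Nov 1994 08:49:37 GMT').tm_year -> 1994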
-_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800} -rfc822._timezones.update(_additional_timezones) -registerDateHandler(_parse_date_rfc822) - -def _parse_date(dateString): - '''Parses a variety of date formats into a 9-tuple in GMT''' - for handler in _date_handlers: - try: - date9tuple = handler(dateString) - if not date9tuple: continue - if len(date9tuple) != 9: - if _debug: sys.stderr.write('date handler function must return 9-tuple\n') - raise ValueError - map(int, date9tuple) - return date9tuple - except Exception, e: - if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e))) - pass - return None - -def _getCharacterEncoding(http_headers, xml_data): - '''Get the character encoding of the XML document - - http_headers is a dictionary - xml_data is a raw string (not Unicode) - - This is so much trickier than it sounds, it's not even funny. - According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type - is application/xml, application/*+xml, - application/xml-external-parsed-entity, or application/xml-dtd, - the encoding given in the charset parameter of the HTTP Content-Type - takes precedence over the encoding given in the XML prefix within the - document, and defaults to 'utf-8' if neither are specified. But, if - the HTTP Content-Type is text/xml, text/*+xml, or - text/xml-external-parsed-entity, the encoding given in the XML prefix - within the document is ALWAYS IGNORED and only the encoding given in - the charset parameter of the HTTP Content-Type header should be - respected, and it defaults to 'us-ascii' if not specified. - - Furthermore, discussion on the atom-syntax mailing list with the - author of RFC 3023 leads me to the conclusion that any document - served with a Content-Type of text/* and no charset parameter - must be treated as us-ascii. (We now do this.) And also that it - must always be flagged as non-well-formed. (We now do this too.) - - If Content-Type is unspecified (input was local file or non-HTTP source) - or unrecognized (server just got it totally wrong), then go by the - encoding given in the XML prefix of the document and default to - 'iso-8859-1' as per the HTTP specification (RFC 2616). - - Then, assuming we didn't find a character encoding in the HTTP headers - (and the HTTP Content-type allowed us to look in the body), we need - to sniff the first few bytes of the XML data and try to determine - whether the encoding is ASCII-compatible. Section F of the XML - specification shows the way here: - http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info - - If the sniffed encoding is not ASCII-compatible, we need to make it - ASCII compatible so that we can sniff further into the XML declaration - to find the encoding attribute, which will tell us the true encoding. - - Of course, none of this guarantees that we will be able to parse the - feed in the declared character encoding (assuming it was declared - correctly, which many are not). CJKCodecs and iconv_codec help a lot; - you should definitely install them if you can. 
- http://cjkpython.i18n.org/ - ''' - - def _parseHTTPContentType(content_type): - '''takes HTTP Content-Type header and returns (content type, charset) - - If no charset is specified, returns (content type, '') - If no content type is specified, returns ('', '') - Both return parameters are guaranteed to be lowercase strings - ''' - content_type = content_type or '' - content_type, params = cgi.parse_header(content_type) - return content_type, params.get('charset', '').replace("'", '') - - sniffed_xml_encoding = '' - xml_encoding = '' - true_encoding = '' - http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type')) - # Must sniff for non-ASCII-compatible character encodings before - # searching for XML declaration. This heuristic is defined in - # section F of the XML specification: - # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info - try: - if xml_data[:4] == '\x4c\x6f\xa7\x94': - # EBCDIC - xml_data = _ebcdic_to_ascii(xml_data) - elif xml_data[:4] == '\x00\x3c\x00\x3f': - # UTF-16BE - sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'): - # UTF-16BE with BOM - sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') - elif xml_data[:4] == '\x3c\x00\x3f\x00': - # UTF-16LE - sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'): - # UTF-16LE with BOM - sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') - elif xml_data[:4] == '\x00\x00\x00\x3c': - # UTF-32BE - sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') - elif xml_data[:4] == '\x3c\x00\x00\x00': - # UTF-32LE - sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') - elif xml_data[:4] == '\x00\x00\xfe\xff': - # UTF-32BE with BOM - sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') - elif xml_data[:4] == '\xff\xfe\x00\x00': - # UTF-32LE with BOM - sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') - elif xml_data[:3] == '\xef\xbb\xbf': - # UTF-8 with BOM - sniffed_xml_encoding = 'utf-8' - xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') - else: - # ASCII-compatible - pass - xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) - except: - xml_encoding_match = None - if xml_encoding_match: - xml_encoding = xml_encoding_match.groups()[0].lower() - if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')): - xml_encoding = sniffed_xml_encoding - acceptable_content_type = 0 - application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity') - text_content_types = ('text/xml', 'text/xml-external-parsed-entity') - if (http_content_type in application_content_types) or \ - (http_content_type.startswith('application/') and http_content_type.endswith('+xml')): - acceptable_content_type = 1 - true_encoding = http_encoding or xml_encoding or 'utf-8' - elif (http_content_type in text_content_types) or \ - (http_content_type.startswith('text/')) and 
http_content_type.endswith('+xml'):
-        acceptable_content_type = 1
-        true_encoding = http_encoding or 'us-ascii'
-    elif http_content_type.startswith('text/'):
-        true_encoding = http_encoding or 'us-ascii'
-    elif http_headers and (not http_headers.has_key('content-type')):
-        true_encoding = xml_encoding or 'iso-8859-1'
-    else:
-        true_encoding = xml_encoding or 'utf-8'
-    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
-
-def _toUTF8(data, encoding):
-    '''Changes an XML data stream on the fly to specify a new encoding
-
-    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
-    encoding is a string recognized by encodings.aliases
-    '''
-    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
-    # strip Byte Order Mark (if present)
-    if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
-        if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-16be':
-                sys.stderr.write('trying utf-16be instead\n')
-        encoding = 'utf-16be'
-        data = data[2:]
-    elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
-        if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-16le':
-                sys.stderr.write('trying utf-16le instead\n')
-        encoding = 'utf-16le'
-        data = data[2:]
-    elif data[:3] == '\xef\xbb\xbf':
-        if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-8':
-                sys.stderr.write('trying utf-8 instead\n')
-        encoding = 'utf-8'
-        data = data[3:]
-    elif data[:4] == '\x00\x00\xfe\xff':
-        if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-32be':
-                sys.stderr.write('trying utf-32be instead\n')
-        encoding = 'utf-32be'
-        data = data[4:]
-    elif data[:4] == '\xff\xfe\x00\x00':
-        if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-32le':
-                sys.stderr.write('trying utf-32le instead\n')
-        encoding = 'utf-32le'
-        data = data[4:]
-    newdata = unicode(data, encoding)
-    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
-    declmatch = re.compile('^<\?xml[^>]*?>')
-    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
-    if declmatch.search(newdata):
-        newdata = declmatch.sub(newdecl, newdata)
-    else:
-        newdata = newdecl + u'\n' + newdata
-    return newdata.encode('utf-8')
-
-def _stripDoctype(data):
-    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
-
-    rss_version may be 'rss091n' or None
-    stripped_data is the same XML document, minus the DOCTYPE
-    '''
-    entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
-    data = entity_pattern.sub('', data)
-    doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
-    doctype_results = doctype_pattern.findall(data)
-    doctype = doctype_results and doctype_results[0] or ''
-    if doctype.lower().count('netscape'):
-        version = 'rss091n'
-    else:
-        version = None
-    data = doctype_pattern.sub('', data)
-    return version, data
-
-def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
-    '''Parse a feed from a URL, file, stream, or string'''
-    result = FeedParserDict()
-    result['feed'] = FeedParserDict()
-    result['entries'] = []
-    if _XML_AVAILABLE:
-        result['bozo'] = 0
-    if type(handlers) == types.InstanceType:
-        handlers = [handlers]
-    try:
-        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
-        data = f.read()
-    except Exception, e:
-        result['bozo'] = 1
-        result['bozo_exception'] = e
-        data = ''
-        f = None
-
-    # if feed is gzip-compressed, decompress it
-    if f
 and data and hasattr(f, 'headers'):
-        if gzip and f.headers.get('content-encoding', '') == 'gzip':
-            try:
-                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
-            except Exception, e:
-                # Some feeds claim to be gzipped but they're not, so
-                # we get garbage. Ideally, we should re-request the
-                # feed without the 'Accept-encoding: gzip' header,
-                # but we don't.
-                result['bozo'] = 1
-                result['bozo_exception'] = e
-                data = ''
-        elif zlib and f.headers.get('content-encoding', '') == 'deflate':
-            try:
-                data = zlib.decompress(data, -zlib.MAX_WBITS)
-            except Exception, e:
-                result['bozo'] = 1
-                result['bozo_exception'] = e
-                data = ''
-
-    # save HTTP headers
-    if hasattr(f, 'info'):
-        info = f.info()
-        result['etag'] = info.getheader('ETag')
-        last_modified = info.getheader('Last-Modified')
-        if last_modified:
-            result['modified'] = _parse_date(last_modified)
-    if hasattr(f, 'url'):
-        result['href'] = f.url
-        result['status'] = 200
-    if hasattr(f, 'status'):
-        result['status'] = f.status
-    if hasattr(f, 'headers'):
-        result['headers'] = f.headers.dict
-    if hasattr(f, 'close'):
-        f.close()
-
-    # there are four encodings to keep track of:
-    # - http_encoding is the encoding declared in the Content-Type HTTP header
-    # - xml_encoding is the encoding declared in the <?xml declaration
; changed
-#  project name
-#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
-#  removed unnecessary urllib code -- urllib2 should always be available anyway;
-#  return actual url, status, and full HTTP headers (as result['url'],
-#  result['status'], and result['headers']) if parsing a remote feed over HTTP --
-#  this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
-#  added the latest namespace-of-the-week for RSS 2.0
-#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
-#  User-Agent (otherwise urllib2 sends two, which confuses some servers)
-#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
-#  inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
-#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
-#  textInput, and also to return the character encoding (if specified)
-#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
-#  nested divs within content (JohnD); fixed missing sys import (JohanS);
-#  fixed regular expression to capture XML character encoding (Andrei);
-#  added support for Atom 0.3-style links; fixed bug with textInput tracking;
-#  added support for cloud (MartijnP); added support for multiple
-#  category/dc:subject (MartijnP); normalize content model: 'description' gets
-#  description (which can come from description, summary, or full content if no
-#  description), 'content' gets dict of base/language/type/value (which can come
-#  from content:encoded, xhtml:body, content, or fullitem);
-#  fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
-#  tracking; fixed bug tracking unknown tags; fixed bug tracking content when
-#  <content> element is not in default namespace (like Pocketsoap feed);
-#  resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
-#  wfw:commentRSS; resolve relative URLs within embedded HTML markup in
-#  description, xhtml:body, content, content:encoded, title, subtitle,
-#  summary, info, tagline, and copyright; added support for pingback and
-#  trackback namespaces
-#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
-#  namespaces, as opposed to 2.6 when I said I did but didn't really;
-#  sanitize HTML markup within some elements; added mxTidy support (if
installed) to tidy HTML markup within some elements; fixed indentation
-#  bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
-#  (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
-#  'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
-#  'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
-#  and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
-#2.7.1 - 1/9/2004 - MAP - fixed bug handling &quot; and &apos;. fixed memory
-#  leak not closing url opener (JohnD); added dc:publisher support (MarekK);
-#  added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
-#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
-#  encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
-#  fixed relative URI processing for guid (skadz); added ICBM support; added
-#  base64 support
-#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
-#  blogspot.com sites); added _debug variable
-#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
-#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
-#  added several new supported namespaces; fixed bug tracking naked markup in
-#  description; added support for enclosure; added support for source; re-added
-#  support for cloud which got dropped somehow; added support for expirationDate
-#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
-#  xml:base URI, one for documents that don't define one explicitly and one for
-#  documents that define an outer and an inner xml:base that goes out of scope
-#  before the end of the document
-#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
-#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
-#  will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
-#  added support for creativeCommons:license and cc:license; added support for
-#  full Atom content model in title, tagline, info, copyright, summary; fixed bug
-#  with gzip encoding (not always telling server we support it when we do)
-#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
-#  (dictionary of 'name', 'url', 'email'); map author to author_detail if author
-#  contains name + email address
-#3.0b8 - 1/28/2004 - MAP - added support for contributor
-#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
-#  support for summary
-#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
-#  xml.util.iso8601
-#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
-#  dangerous markup; fiddled with decodeEntities (not right); liberalized
-#  date parsing even further
-#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
-#  added support to Atom 0.2 subtitle; added support for Atom content model
-#  in copyright; better sanitizing of dangerous HTML elements with end tags
-#  (script, frameset)
-#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
-#  etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
-#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
-#  Python 2.1
-#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
-#  fixed bug capturing author and contributor URL; fixed bug resolving relative
-#  links in author and contributor URL; fixed bug resolving relative links in
-#  generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
-#  namespace tests, and included them permanently in the test suite with his
-#  permission; fixed namespace handling under Python 2.1
-#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
-#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
-#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
-#  use libxml2 (if available)
-#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
-#  name was in parentheses; removed ultra-problematic mxTidy support; patch to
-#  workaround crash in PyXML/expat when encountering invalid entities
-#  (MarkMoraes); support for textinput/textInput
-#3.0b20 - 4/7/2004 - MAP - added CDF support
-#3.0b21 - 4/14/2004 - MAP - added Hot RSS support
-#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
-#  results dict; changed results dict to allow getting values with results.key
-#  as well as results[key]; work around embedded illformed HTML with half
-#  a DOCTYPE; work around malformed Content-Type header; if character encoding
-#  is wrong, try several common ones before falling back to regexes (if this
-#  works, bozo_exception is set to CharacterEncodingOverride); fixed character
-#  encoding issues in BaseHTMLProcessor by tracking encoding and converting
-#  from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
-#  convert each value in results to Unicode (if possible), even if using
-#  regex-based parsing
-#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
-#  high-bit characters in attributes in embedded HTML in description (thanks
-#  Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
-#  FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
-#  about a mapped key
-#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
-#  results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
-#  cause the same encoding to be tried twice (even if it failed the first time);
-#  fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
-#  better textinput and image tracking in illformed RSS 1.0 feeds
-#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
-#  my blink tag tests
-#3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
-#  failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
-#  duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
-#  added support for image; refactored parse() fallback logic to try other
-#  encodings if SAX parsing fails (previously it would only try other encodings
-#  if re-encoding failed); remove unichr madness in normalize_attrs now that
-#  we're properly tracking encoding in and out of BaseHTMLProcessor; set
-#  feed.language from root-level xml:lang; set entry.id from rdf:about;
-#  send Accept header
-#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
-#  iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
-#  windows-1252); fixed regression that could cause the same encoding to be
-#  tried twice (even
if it failed the first time) -#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types; -# recover from malformed content-type header parameter with no equals sign -# ('text/xml; charset:iso-8859-1') -#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities -# to Unicode equivalents in illformed feeds (aaronsw); added and -# passed tests for converting character entities to Unicode equivalents -# in illformed feeds (aaronsw); test for valid parsers when setting -# XML_AVAILABLE; make version and encoding available when server returns -# a 304; add handlers parameter to pass arbitrary urllib2 handlers (like -# digest auth or proxy support); add code to parse username/password -# out of url and send as basic authentication; expose downloading-related -# exceptions in bozo_exception (aaronsw); added __contains__ method to -# FeedParserDict (aaronsw); added publisher_detail (aaronsw) -#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always -# convert feed to UTF-8 before passing to XML parser; completely revamped -# logic for determining character encoding and attempting XML parsing -# (much faster); increased default timeout to 20 seconds; test for presence -# of Location header on redirects; added tests for many alternate character -# encodings; support various EBCDIC encodings; support UTF-16BE and -# UTF16-LE with or without a BOM; support UTF-8 with a BOM; support -# UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no -# XML parsers are available; added support for 'Content-encoding: deflate'; -# send blank 'Accept-encoding: ' header if neither gzip nor zlib modules -# are available -#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure -# problem tracking xml:base and xml:lang if element declares it, child -# doesn't, first grandchild redeclares it, and second grandchild doesn't; -# refactored date parsing; defined public registerDateHandler so callers -# can add support for additional date formats at runtime; added support -# for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added -# zopeCompatibilityHack() which turns FeedParserDict into a regular -# dictionary, required for Zope compatibility, and also makes command- -# line debugging easier because pprint module formats real dictionaries -# better than dictionary-like objects; added NonXMLContentType exception, -# which is stored in bozo_exception when a feed is served with a non-XML -# media type such as 'text/plain'; respect Content-Language as default -# language if not xml:lang is present; cloud dict is now FeedParserDict; -# generator dict is now FeedParserDict; better tracking of xml:lang, -# including support for xml:lang='' to unset the current language; -# recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default -# namespace; don't overwrite final status on redirects (scenarios: -# redirecting to a URL that returns 304, redirecting to a URL that -# redirects to another URL with a different type of redirect); add -# support for HTTP 303 redirects -#4.0 - MAP - support for relative URIs in xml:base attribute; fixed -# encoding issue with mxTidy (phopkins); preliminary support for RFC 3229; -# support for Atom 1.0; support for iTunes extensions; new 'tags' for -# categories/keywords/etc. 
as array of dict -# {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0 -# terminology; parse RFC 822-style dates with no time; lots of other -# bug fixes -#4.1 - MAP - removed socket timeout; added support for chardet library diff --git a/code/planet/htmltmpl.py b/code/planet/htmltmpl.py deleted file mode 100644 index be6e41bb..00000000 --- a/code/planet/htmltmpl.py +++ /dev/null @@ -1,1480 +0,0 @@ - -""" A templating engine for separation of code and HTML. - - The documentation of this templating engine is separated to two parts: - - 1. Description of the templating language. - - 2. Documentation of classes and API of this module that provides - a Python implementation of the templating language. - - All the documentation can be found in 'doc' directory of the - distribution tarball or at the homepage of the engine. - Latest versions of this module are also available at that website. - - You can use and redistribute this module under conditions of the - GNU General Public License that can be found either at - [ http://www.gnu.org/ ] or in file "LICENSE" contained in the - distribution tarball of this module. - - Copyright (c) 2001 Tomas Styblo, tripie@cpan.org - - @name htmltmpl - @version 1.22 - @author-name Tomas Styblo - @author-email tripie@cpan.org - @website http://htmltmpl.sourceforge.net/ - @license-name GNU GPL - @license-url http://www.gnu.org/licenses/gpl.html -""" - -__version__ = 1.22 -__author__ = "Tomas Styblo (tripie@cpan.org)" - -# All imported modules are part of the standard Python library. - -from types import * -import re -import os -import os.path -import pprint # only for debugging -import sys -import copy -import cgi # for HTML escaping of variables -import urllib # for URL escaping of variables -import cPickle # for template compilation -import gettext - -INCLUDE_DIR = "inc" - -# Total number of possible parameters. -# Increment if adding a parameter to any statement. -PARAMS_NUMBER = 3 - -# Relative positions of parameters in TemplateCompiler.tokenize(). -PARAM_NAME = 1 -PARAM_ESCAPE = 2 -PARAM_GLOBAL = 3 -PARAM_GETTEXT_STRING = 1 - -# Find a way to lock files. Currently implemented only for UNIX and windows. -LOCKTYPE_FCNTL = 1 -LOCKTYPE_MSVCRT = 2 -LOCKTYPE = None -try: - import fcntl -except: - try: - import msvcrt - except: - LOCKTYPE = None - else: - LOCKTYPE = LOCKTYPE_MSVCRT -else: - LOCKTYPE = LOCKTYPE_FCNTL -LOCK_EX = 1 -LOCK_SH = 2 -LOCK_UN = 3 - -############################################## -# CLASS: TemplateManager # -############################################## - -class TemplateManager: - """ Class that manages compilation and precompilation of templates. - - You should use this class whenever you work with templates - that are stored in a file. The class can create a compiled - template and transparently manage its precompilation. It also - keeps the precompiled templates up-to-date by modification times - comparisons. - """ - - def __init__(self, include=1, max_include=5, precompile=1, comments=1, - gettext=0, debug=0): - """ Constructor. - - @header - __init__(include=1, max_include=5, precompile=1, comments=1, - gettext=0, debug=0) - - @param include Enable or disable included templates. - This optional parameter can be used to enable or disable - TMPL_INCLUDE inclusion of templates. Disabling of - inclusion can improve performance a bit. The inclusion is - enabled by default. - - @param max_include Maximum depth of nested inclusions. 
- This optional parameter can be used to specify maximum depth of - nested TMPL_INCLUDE inclusions. It defaults to 5. - This setting prevents infinite recursive inclusions. - - @param precompile Enable or disable precompilation of templates. - This optional parameter can be used to enable or disable - creation and usage of precompiled templates. - - A precompiled template is saved to the same directory in - which the main template file is located. You need write - permissions to that directory. - - Precompilation provides a significant performance boost because - it's not necessary to parse the templates over and over again. - The boost is especially noticeable when templates that include - other templates are used. - - Comparison of modification times of the main template and all - included templates is used to ensure that the precompiled - templates are up-to-date. Templates are also recompiled if the - htmltmpl module is updated. - - The TemplateErrorexception is raised when the precompiled - template cannot be saved. Precompilation is enabled by default. - - Precompilation is available only on UNIX and Windows platforms, - because proper file locking which is necessary to ensure - multitask safe behaviour is platform specific and is not - implemented for other platforms. Attempts to enable precompilation - on the other platforms result in raise of the - TemplateError exception. - - @param comments Enable or disable template comments. - This optional parameter can be used to enable or disable - template comments. - Disabling of the comments can improve performance a bit. - Comments are enabled by default. - - @param gettext Enable or disable gettext support. - - @param debug Enable or disable debugging messages. - This optional parameter is a flag that can be used to enable - or disable debugging messages which are printed to the standard - error output. The debugging messages are disabled by default. - """ - # Save the optional parameters. - # These values are not modified by any method. - self._include = include - self._max_include = max_include - self._precompile = precompile - self._comments = comments - self._gettext = gettext - self._debug = debug - - # Find what module to use to lock files. - # File locking is necessary for the 'precompile' feature to be - # multitask/thread safe. Currently it works only on UNIX - # and Windows. Anyone willing to implement it on Mac ? - if precompile and not LOCKTYPE: - raise TemplateError, "Template precompilation is not "\ - "available on this platform." - self.DEB("INIT DONE") - - def prepare(self, file): - """ Preprocess, parse, tokenize and compile the template. - - If precompilation is enabled then this method tries to load - a precompiled form of the template from the same directory - in which the template source file is located. If it succeeds, - then it compares modification times stored in the precompiled - form to modification times of source files of the template, - including source files of all templates included via the - TMPL_INCLUDE statements. If any of the modification times - differs, then the template is recompiled and the precompiled - form updated. - - If precompilation is disabled, then this method parses and - compiles the template. - - @header prepare(file) - - @return Compiled template. - The methods returns an instance of the Template class - which is a compiled form of the template. This instance can be - used as input for the TemplateProcessor. - - @param file Path to the template file to prepare. 
- The method looks for the template file in current directory - if the parameter is a relative path. All included templates must - be placed in subdirectory 'inc' of the - directory in which the main template file is located. - """ - compiled = None - if self._precompile: - if self.is_precompiled(file): - try: - precompiled = self.load_precompiled(file) - except PrecompiledError, template: - print >> sys.stderr, "Htmltmpl: bad precompiled "\ - "template '%s' removed" % template - compiled = self.compile(file) - self.save_precompiled(compiled) - else: - precompiled.debug(self._debug) - compile_params = (self._include, self._max_include, - self._comments, self._gettext) - if precompiled.is_uptodate(compile_params): - self.DEB("PRECOMPILED: UPTODATE") - compiled = precompiled - else: - self.DEB("PRECOMPILED: NOT UPTODATE") - compiled = self.update(precompiled) - else: - self.DEB("PRECOMPILED: NOT PRECOMPILED") - compiled = self.compile(file) - self.save_precompiled(compiled) - else: - self.DEB("PRECOMPILATION DISABLED") - compiled = self.compile(file) - return compiled - - def update(self, template): - """ Update (recompile) a compiled template. - - This method recompiles a template compiled from a file. - If precompilation is enabled then the precompiled form saved on - disk is also updated. - - @header update(template) - - @return Recompiled template. - It's ensured that the returned template is up-to-date. - - @param template A compiled template. - This parameter should be an instance of the Template - class, created either by the TemplateManager or by the - TemplateCompiler. The instance must represent a template - compiled from a file on disk. - """ - self.DEB("UPDATE") - updated = self.compile(template.file()) - if self._precompile: - self.save_precompiled(updated) - return updated - - ############################################## - # PRIVATE METHODS # - ############################################## - - def DEB(self, str): - """ Print debugging message to stderr if debugging is enabled. - @hidden - """ - if self._debug: print >> sys.stderr, str - - def lock_file(self, file, lock): - """ Provide platform independent file locking. - @hidden - """ - fd = file.fileno() - if LOCKTYPE == LOCKTYPE_FCNTL: - if lock == LOCK_SH: - fcntl.flock(fd, fcntl.LOCK_SH) - elif lock == LOCK_EX: - fcntl.flock(fd, fcntl.LOCK_EX) - elif lock == LOCK_UN: - fcntl.flock(fd, fcntl.LOCK_UN) - else: - raise TemplateError, "BUG: bad lock in lock_file" - elif LOCKTYPE == LOCKTYPE_MSVCRT: - if lock == LOCK_SH: - # msvcrt does not support shared locks :-( - msvcrt.locking(fd, msvcrt.LK_LOCK, 1) - elif lock == LOCK_EX: - msvcrt.locking(fd, msvcrt.LK_LOCK, 1) - elif lock == LOCK_UN: - msvcrt.locking(fd, msvcrt.LK_UNLCK, 1) - else: - raise TemplateError, "BUG: bad lock in lock_file" - else: - raise TemplateError, "BUG: bad locktype in lock_file" - - def compile(self, file): - """ Compile the template. - @hidden - """ - return TemplateCompiler(self._include, self._max_include, - self._comments, self._gettext, - self._debug).compile(file) - - def is_precompiled(self, file): - """ Return true if the template is already precompiled on the disk. - This method doesn't check whether the compiled template is - uptodate. - @hidden - """ - filename = file + "c" # "template.tmplc" - if os.path.isfile(filename): - return 1 - else: - return 0 - - def load_precompiled(self, file): - """ Load precompiled template from disk. - - Remove the precompiled template file and recompile it - if the file contains corrupted or unpicklable data. 
- - @hidden - """ - filename = file + "c" # "template.tmplc" - self.DEB("LOADING PRECOMPILED") - try: - remove_bad = 0 - file = None - try: - file = open(filename, "rb") - self.lock_file(file, LOCK_SH) - precompiled = cPickle.load(file) - except IOError, (errno, errstr): - raise TemplateError, "IO error in load precompiled "\ - "template '%s': (%d) %s"\ - % (filename, errno, errstr) - except cPickle.UnpicklingError: - remove_bad = 1 - raise PrecompiledError, filename - except: - remove_bad = 1 - raise - else: - return precompiled - finally: - if file: - self.lock_file(file, LOCK_UN) - file.close() - if remove_bad and os.path.isfile(filename): - # X: We may lose the original exception here, raising OSError. - os.remove(filename) - - def save_precompiled(self, template): - """ Save compiled template to disk in precompiled form. - - Associated metadata is also saved. It includes: filename of the - main template file, modification time of the main template file, - modification times of all included templates and version of the - htmltmpl module which compiled the template. - - The method removes a file which is saved only partially because - of some error. - - @hidden - """ - filename = template.file() + "c" # creates "template.tmplc" - # Check if we have write permission to the template's directory. - template_dir = os.path.dirname(os.path.abspath(filename)) - if not os.access(template_dir, os.W_OK): - raise TemplateError, "Cannot save precompiled templates "\ - "to '%s': write permission denied."\ - % template_dir - try: - remove_bad = 0 - file = None - try: - file = open(filename, "wb") # may truncate existing file - self.lock_file(file, LOCK_EX) - BINARY = 1 - READABLE = 0 - if self._debug: - cPickle.dump(template, file, READABLE) - else: - cPickle.dump(template, file, BINARY) - except IOError, (errno, errstr): - remove_bad = 1 - raise TemplateError, "IO error while saving precompiled "\ - "template '%s': (%d) %s"\ - % (filename, errno, errstr) - except cPickle.PicklingError, error: - remove_bad = 1 - raise TemplateError, "Pickling error while saving "\ - "precompiled template '%s': %s"\ - % (filename, error) - except: - remove_bad = 1 - raise - else: - self.DEB("SAVING PRECOMPILED") - finally: - if file: - self.lock_file(file, LOCK_UN) - file.close() - if remove_bad and os.path.isfile(filename): - # X: We may lose the original exception here, raising OSError. - os.remove(filename) - - -############################################## -# CLASS: TemplateProcessor # -############################################## - -class TemplateProcessor: - """ Fill the template with data and process it. - - This class provides actual processing of a compiled template. - Use it to set template variables and loops and then obtain - result of the processing. - """ - - def __init__(self, html_escape=1, magic_vars=1, global_vars=0, debug=0): - """ Constructor. - - @header __init__(html_escape=1, magic_vars=1, global_vars=0, - debug=0) - - @param html_escape Enable or disable HTML escaping of variables. - This optional parameter is a flag that can be used to enable or - disable automatic HTML escaping of variables. - All variables are by default automatically HTML escaped. - The escaping process substitutes HTML brackets, ampersands and - double quotes with appropriate HTML entities. - - @param magic_vars Enable or disable loop magic variables. - This parameter can be used to enable or disable - "magic" context variables, that are automatically defined inside - loops. Magic variables are enabled by default. 
- - Refer to the language specification for description of these - magic variables. - - @param global_vars Globally activate global lookup of variables. - This optional parameter is a flag that can be used to specify - whether variables which cannot be found in the current scope - should be automatically looked up in enclosing scopes. - - Automatic global lookup is disabled by default. Global lookup - can be overriden on a per-variable basis by the - GLOBAL parameter of a TMPL_VAR - statement. - - @param debug Enable or disable debugging messages. - """ - self._html_escape = html_escape - self._magic_vars = magic_vars - self._global_vars = global_vars - self._debug = debug - - # Data structure containing variables and loops set by the - # application. Use debug=1, process some template and - # then check stderr to see how the structure looks. - # It's modified only by set() and reset() methods. - self._vars = {} - - # Following variables are for multipart templates. - self._current_part = 1 - self._current_pos = 0 - - def set(self, var, value): - """ Associate a value with top-level template variable or loop. - - A template identifier can represent either an ordinary variable - (string) or a loop. - - To assign a value to a string identifier pass a scalar - as the 'value' parameter. This scalar will be automatically - converted to string. - - To assign a value to a loop identifier pass a list of mappings as - the 'value' parameter. The engine iterates over this list and - assigns values from the mappings to variables in a template loop - block if a key in the mapping corresponds to a name of a variable - in the loop block. The number of mappings contained in this list - is equal to number of times the loop block is repeated in the - output. - - @header set(var, value) - @return No return value. - - @param var Name of template variable or loop. - @param value The value to associate. - - """ - # The correctness of character case is verified only for top-level - # variables. - if self.is_ordinary_var(value): - # template top-level ordinary variable - if not var.islower(): - raise TemplateError, "Invalid variable name '%s'." % var - elif type(value) == ListType: - # template top-level loop - if var != var.capitalize(): - raise TemplateError, "Invalid loop name '%s'." % var - else: - raise TemplateError, "Value of toplevel variable '%s' must "\ - "be either a scalar or a list." % var - self._vars[var] = value - self.DEB("VALUE SET: " + str(var)) - - def reset(self, keep_data=0): - """ Reset the template data. - - This method resets the data contained in the template processor - instance. The template processor instance can be used to process - any number of templates, but this method must be called after - a template is processed to reuse the instance, - - @header reset(keep_data=0) - @return No return value. - - @param keep_data Do not reset the template data. - Use this flag if you do not want the template data to be erased. - This way you can reuse the data contained in the instance of - the TemplateProcessor. - """ - self._current_part = 1 - self._current_pos = 0 - if not keep_data: - self._vars.clear() - self.DEB("RESET") - - def process(self, template, part=None): - """ Process a compiled template. Return the result as string. - - This method actually processes a template and returns - the result. - - @header process(template, part=None) - @return Result of the processing as string. - - @param template A compiled template. 
- Value of this parameter must be an instance of the - Template class created either by the - TemplateManager or by the TemplateCompiler. - - @param part The part of a multipart template to process. - This parameter can be used only together with a multipart - template. It specifies the number of the part to process. - It must be greater than zero, because the parts are numbered - from one. - - The parts must be processed in the right order. You - cannot process a part which precedes an already processed part. - - If this parameter is not specified, then the whole template - is processed, or all remaining parts are processed. - """ - self.DEB("APP INPUT:") - if self._debug: pprint.pprint(self._vars, sys.stderr) - if part != None and (part == 0 or part < self._current_part): - raise TemplateError, "process() - invalid part number" - - # This flag means "jump behind the end of current statement" or - # "skip the parameters of current statement". - # Even parameters that actually are not present in the template - # do appear in the list of tokens as empty items ! - skip_params = 0 - - # Stack for enabling or disabling output in response to TMPL_IF, - # TMPL_UNLESS, TMPL_ELSE and TMPL_LOOPs with no passes. - output_control = [] - ENABLE_OUTPUT = 1 - DISABLE_OUTPUT = 0 - - # Stacks for data related to loops. - loop_name = [] # name of a loop - loop_pass = [] # current pass of a loop (counted from zero) - loop_start = [] # index of loop start in token list - loop_total = [] # total number of passes in a loop - - tokens = template.tokens() - len_tokens = len(tokens) - out = "" # buffer for processed output - - # Recover position at which we ended after processing of last part. - i = self._current_pos - - # Process the list of tokens. - while 1: - if i == len_tokens: break - if skip_params: - # Skip the parameters following a statement. - skip_params = 0 - i += PARAMS_NUMBER - continue - - token = tokens[i] - if token.startswith("." - escape = tokens[i + PARAM_ESCAPE] - globalp = tokens[i + PARAM_GLOBAL] - skip_params = 1 - - # If output of current block is not disabled then append - # the substitued and escaped variable to the output. - if DISABLE_OUTPUT not in output_control: - value = str(self.find_value(var, loop_name, loop_pass, - loop_total, globalp)) - out += self.escape(value, escape) - self.DEB("VAR: " + str(var)) - - elif token == "." - skip_params = 1 - - # Find total number of passes in this loop. - passtotal = self.find_value(var, loop_name, loop_pass, - loop_total) - if not passtotal: passtotal = 0 - # Push data for this loop on the stack. - loop_total.append(passtotal) - loop_start.append(i) - loop_pass.append(0) - loop_name.append(var) - - # Disable output of loop block if the number of passes - # in this loop is zero. - if passtotal == 0: - # This loop is empty. - output_control.append(DISABLE_OUTPUT) - self.DEB("LOOP: DISABLE: " + str(var)) - else: - output_control.append(ENABLE_OUTPUT) - self.DEB("LOOP: FIRST PASS: %s TOTAL: %d"\ - % (var, passtotal)) - - elif token == "." - globalp = tokens[i + PARAM_GLOBAL] - skip_params = 1 - if self.find_value(var, loop_name, loop_pass, - loop_total, globalp): - output_control.append(ENABLE_OUTPUT) - self.DEB("IF: ENABLE: " + str(var)) - else: - output_control.append(DISABLE_OUTPUT) - self.DEB("IF: DISABLE: " + str(var)) - - elif token == "." 
- globalp = tokens[i + PARAM_GLOBAL] - skip_params = 1 - if self.find_value(var, loop_name, loop_pass, - loop_total, globalp): - output_control.append(DISABLE_OUTPUT) - self.DEB("UNLESS: DISABLE: " + str(var)) - else: - output_control.append(ENABLE_OUTPUT) - self.DEB("UNLESS: ENABLE: " + str(var)) - - elif token == "." - - # If this loop was not disabled, then record the pass. - if loop_total[-1] > 0: loop_pass[-1] += 1 - - if loop_pass[-1] == loop_total[-1]: - # There are no more passes in this loop. Pop - # the loop from stack. - loop_pass.pop() - loop_name.pop() - loop_start.pop() - loop_total.pop() - output_control.pop() - self.DEB("LOOP: END") - else: - # Jump to the beggining of this loop block - # to process next pass of the loop. - i = loop_start[-1] - self.DEB("LOOP: NEXT PASS") - - elif token == "." - output_control.pop() - self.DEB("IF: END") - - elif token == "." - output_control.pop() - self.DEB("UNLESS: END") - - elif token == "." - if output_control[-1] == DISABLE_OUTPUT: - # Condition was false, activate the ELSE block. - output_control[-1] = ENABLE_OUTPUT - self.DEB("ELSE: ENABLE") - elif output_control[-1] == ENABLE_OUTPUT: - # Condition was true, deactivate the ELSE block. - output_control[-1] = DISABLE_OUTPUT - self.DEB("ELSE: DISABLE") - else: - raise TemplateError, "BUG: ELSE: INVALID FLAG" - - elif token == " -

-                    <strong>HTMLTMPL WARNING:</strong><br />
-                    Cannot include template: <strong>%s</strong>
-                    </p>
-                    <br />
- """ % filename - self.DEB("CANNOT INCLUDE WARNING") - - elif token == "." % token - - elif DISABLE_OUTPUT not in output_control: - # Raw textual template data. - # If output of current block is not disabled, then - # append template data to the output buffer. - out += token - - i += 1 - # end of the big while loop - - # Check whether all opening statements were closed. - if loop_name: raise TemplateError, "Missing ." - if output_control: raise TemplateError, "Missing or " - return out - - ############################################## - # PRIVATE METHODS # - ############################################## - - def DEB(self, str): - """ Print debugging message to stderr if debugging is enabled. - @hidden - """ - if self._debug: print >> sys.stderr, str - - def find_value(self, var, loop_name, loop_pass, loop_total, - global_override=None): - """ Search the self._vars data structure to find variable var - located in currently processed pass of a loop which - is currently being processed. If the variable is an ordinary - variable, then return it. - - If the variable is an identificator of a loop, then - return the total number of times this loop will - be executed. - - Return an empty string, if the variable is not - found at all. - - @hidden - """ - # Search for the requested variable in magic vars if the name - # of the variable starts with "__" and if we are inside a loop. - if self._magic_vars and var.startswith("__") and loop_name: - return self.magic_var(var, loop_pass[-1], loop_total[-1]) - - # Search for an ordinary variable or for a loop. - # Recursively search in self._vars for the requested variable. - scope = self._vars - globals = [] - for i in range(len(loop_name)): - # If global lookup is on then push the value on the stack. - if ((self._global_vars and global_override != "0") or \ - global_override == "1") and scope.has_key(var) and \ - self.is_ordinary_var(scope[var]): - globals.append(scope[var]) - - # Descent deeper into the hierarchy. - if scope.has_key(loop_name[i]) and scope[loop_name[i]]: - scope = scope[loop_name[i]][loop_pass[i]] - else: - return "" - - if scope.has_key(var): - # Value exists in current loop. - if type(scope[var]) == ListType: - # The requested value is a loop. - # Return total number of its passes. - return len(scope[var]) - else: - return scope[var] - elif globals and \ - ((self._global_vars and global_override != "0") or \ - global_override == "1"): - # Return globally looked up value. - return globals.pop() - else: - # No value found. - if var[0].isupper(): - # This is a loop name. - # Return zero, because the user wants to know number - # of its passes. - return 0 - else: - return "" - - def magic_var(self, var, loop_pass, loop_total): - """ Resolve and return value of a magic variable. - Raise an exception if the magic variable is not recognized. - - @hidden - """ - self.DEB("MAGIC: '%s', PASS: %d, TOTAL: %d"\ - % (var, loop_pass, loop_total)) - if var == "__FIRST__": - if loop_pass == 0: - return 1 - else: - return 0 - elif var == "__LAST__": - if loop_pass == loop_total - 1: - return 1 - else: - return 0 - elif var == "__INNER__": - # If this is neither the first nor the last pass. - if loop_pass != 0 and loop_pass != loop_total - 1: - return 1 - else: - return 0 - elif var == "__PASS__": - # Magic variable __PASS__ counts passes from one. - return loop_pass + 1 - elif var == "__PASSTOTAL__": - return loop_total - elif var == "__ODD__": - # Internally pass numbers stored in loop_pass are counted from - # zero. 
But the template language presents them counted from one. - # Therefore we must add one to the actual loop_pass value to get - # the value we present to the user. - if (loop_pass + 1) % 2 != 0: - return 1 - else: - return 0 - elif var.startswith("__EVERY__"): - # Magic variable __EVERY__x is never true in first or last pass. - if loop_pass != 0 and loop_pass != loop_total - 1: - # Check if an integer follows the variable name. - try: - every = int(var[9:]) # nine is length of "__EVERY__" - except ValueError: - raise TemplateError, "Magic variable __EVERY__x: "\ - "Invalid pass number." - else: - if not every: - raise TemplateError, "Magic variable __EVERY__x: "\ - "Pass number cannot be zero." - elif (loop_pass + 1) % every == 0: - self.DEB("MAGIC: EVERY: " + str(every)) - return 1 - else: - return 0 - else: - return 0 - else: - raise TemplateError, "Invalid magic variable '%s'." % var - - def escape(self, str, override=""): - """ Escape a string either by HTML escaping or by URL escaping. - @hidden - """ - ESCAPE_QUOTES = 1 - if (self._html_escape and override != "NONE" and override != "0" and \ - override != "URL") or override == "HTML" or override == "1": - return cgi.escape(str, ESCAPE_QUOTES) - elif override == "URL": - return urllib.quote_plus(str) - else: - return str - - def is_ordinary_var(self, var): - """ Return true if var is a scalar. (not a reference to loop) - @hidden - """ - if type(var) == StringType or type(var) == IntType or \ - type(var) == LongType or type(var) == FloatType: - return 1 - else: - return 0 - - -############################################## -# CLASS: TemplateCompiler # -############################################## - -class TemplateCompiler: - """ Preprocess, parse, tokenize and compile the template. - - This class parses the template and produces a 'compiled' form - of it. This compiled form is an instance of the Template - class. The compiled form is used as input for the TemplateProcessor - which uses it to actually process the template. - - This class should be used direcly only when you need to compile - a template from a string. If your template is in a file, then you - should use the TemplateManager class which provides - a higher level interface to this class and also can save the - compiled template to disk in a precompiled form. - """ - - def __init__(self, include=1, max_include=5, comments=1, gettext=0, - debug=0): - """ Constructor. - - @header __init__(include=1, max_include=5, comments=1, gettext=0, - debug=0) - - @param include Enable or disable included templates. - @param max_include Maximum depth of nested inclusions. - @param comments Enable or disable template comments. - @param gettext Enable or disable gettext support. - @param debug Enable or disable debugging messages. - """ - - self._include = include - self._max_include = max_include - self._comments = comments - self._gettext = gettext - self._debug = debug - - # This is a list of filenames of all included templates. - # It's modified by the include_templates() method. - self._include_files = [] - - # This is a counter of current inclusion depth. It's used to prevent - # infinite recursive includes. - self._include_level = 0 - - def compile(self, file): - """ Compile template from a file. - - @header compile(file) - @return Compiled template. - The return value is an instance of the Template - class. - - @param file Filename of the template. - See the prepare() method of the TemplateManager - class for exaplanation of this parameter. 
- """ - - self.DEB("COMPILING FROM FILE: " + file) - self._include_path = os.path.join(os.path.dirname(file), INCLUDE_DIR) - tokens = self.parse(self.read(file)) - compile_params = (self._include, self._max_include, self._comments, - self._gettext) - return Template(__version__, file, self._include_files, - tokens, compile_params, self._debug) - - def compile_string(self, data): - """ Compile template from a string. - - This method compiles a template from a string. The - template cannot include any templates. - TMPL_INCLUDE statements are turned into warnings. - - @header compile_string(data) - @return Compiled template. - The return value is an instance of the Template - class. - - @param data String containing the template data. - """ - self.DEB("COMPILING FROM STRING") - self._include = 0 - tokens = self.parse(data) - compile_params = (self._include, self._max_include, self._comments, - self._gettext) - return Template(__version__, None, None, tokens, compile_params, - self._debug) - - ############################################## - # PRIVATE METHODS # - ############################################## - - def DEB(self, str): - """ Print debugging message to stderr if debugging is enabled. - @hidden - """ - if self._debug: print >> sys.stderr, str - - def read(self, filename): - """ Read content of file and return it. Raise an error if a problem - occurs. - @hidden - """ - self.DEB("READING: " + filename) - try: - f = None - try: - f = open(filename, "r") - data = f.read() - except IOError, (errno, errstr): - raise TemplateError, "IO error while reading template '%s': "\ - "(%d) %s" % (filename, errno, errstr) - else: - return data - finally: - if f: f.close() - - def parse(self, template_data): - """ Parse the template. This method is recursively called from - within the include_templates() method. - - @return List of processing tokens. - @hidden - """ - if self._comments: - self.DEB("PREPROCESS: COMMENTS") - template_data = self.remove_comments(template_data) - tokens = self.tokenize(template_data) - if self._include: - self.DEB("PREPROCESS: INCLUDES") - self.include_templates(tokens) - return tokens - - def remove_comments(self, template_data): - """ Remove comments from the template data. - @hidden - """ - pattern = r"### .*" - return re.sub(pattern, "", template_data) - - def include_templates(self, tokens): - """ Process TMPL_INCLUDE statements. Use the include_level counter - to prevent infinite recursion. Record paths to all included - templates to self._include_files. - @hidden - """ - i = 0 - out = "" # buffer for output - skip_params = 0 - - # Process the list of tokens. - while 1: - if i == len(tokens): break - if skip_params: - skip_params = 0 - i += PARAMS_NUMBER - continue - - token = tokens[i] - if token == "." - self._include_level += 1 - if self._include_level > self._max_include: - # Do not include the template. - # Protection against infinite recursive includes. - skip_params = 1 - self.DEB("INCLUDE: LIMIT REACHED: " + filename) - else: - # Include the template. - skip_params = 0 - include_file = os.path.join(self._include_path, filename) - self._include_files.append(include_file) - include_data = self.read(include_file) - include_tokens = self.parse(include_data) - - # Append the tokens from the included template to actual - # position in the tokens list, replacing the TMPL_INCLUDE - # token and its parameters. - tokens[i:i+PARAMS_NUMBER+1] = include_tokens - i = i + len(include_tokens) - self.DEB("INCLUDED: " + filename) - continue # Do not increment 'i' below. 
- i += 1 - # end of the main while loop - - if self._include_level > 0: self._include_level -= 1 - return out - - def tokenize(self, template_data): - """ Split the template into tokens separated by template statements. - The statements itself and associated parameters are also - separately included in the resulting list of tokens. - Return list of the tokens. - - @hidden - """ - self.DEB("TOKENIZING TEMPLATE") - # NOTE: The TWO double quotes in character class in the regexp below - # are there only to prevent confusion of syntax highlighter in Emacs. - pattern = r""" - (?:^[ \t]+)? # eat spaces, tabs (opt.) - (< - (?:!--[ ])? # comment start + space (opt.) - /?TMPL_[A-Z]+ # closing slash / (opt.) + statement - [ a-zA-Z0-9""/.=:_\\-]* # this spans also comments ending (--) - >) - [%s]? # eat trailing newline (opt.) - """ % os.linesep - rc = re.compile(pattern, re.VERBOSE | re.MULTILINE) - split = rc.split(template_data) - tokens = [] - for statement in split: - if statement.startswith(" 0 and '=' not in params[0]: - # implicit identifier - name = params[0] - del params[0] - else: - # explicit identifier as a 'NAME' parameter - name = self.find_param("NAME", params) - self.DEB("TOKENIZER: NAME: " + str(name)) - return name - - def find_param(self, param, params): - """ Extract value of parameter from a statement. - @hidden - """ - for pair in params: - name, value = pair.split("=") - if not name or not value: - raise TemplateError, "Syntax error in template." - if name == param: - if value[0] == '"': - # The value is in double quotes. - ret_value = value[1:-1] - else: - # The value is without double quotes. - ret_value = value - self.DEB("TOKENIZER: PARAM: '%s' => '%s'" % (param, ret_value)) - return ret_value - else: - self.DEB("TOKENIZER: PARAM: '%s' => NOT DEFINED" % param) - return None - - -############################################## -# CLASS: Template # -############################################## - -class Template: - """ This class represents a compiled template. - - This class provides storage and methods for the compiled template - and associated metadata. It's serialized by pickle if we need to - save the compiled template to disk in a precompiled form. - - You should never instantiate this class directly. Always use the - TemplateManager or TemplateCompiler classes to - create the instances of this class. - - The only method which you can directly use is the is_uptodate - method. - """ - - def __init__(self, version, file, include_files, tokens, compile_params, - debug=0): - """ Constructor. - @hidden - """ - self._version = version - self._file = file - self._tokens = tokens - self._compile_params = compile_params - self._debug = debug - self._mtime = None - self._include_mtimes = {} - - if not file: - self.DEB("TEMPLATE WAS COMPILED FROM A STRING") - return - - # Save modifitcation time of the main template file. - if os.path.isfile(file): - self._mtime = os.path.getmtime(file) - else: - raise TemplateError, "Template: file does not exist: '%s'" % file - - # Save modificaton times of all included template files. - for inc_file in include_files: - if os.path.isfile(inc_file): - self._include_mtimes[inc_file] = os.path.getmtime(inc_file) - else: - raise TemplateError, "Template: file does not exist: '%s'"\ - % inc_file - - self.DEB("NEW TEMPLATE CREATED") - - def is_uptodate(self, compile_params=None): - """ Check whether the compiled template is uptodate. - - Return true if this compiled template is uptodate. 
- Return false, if the template source file was changed on the - disk since it was compiled. - Works by comparison of modification times. - Also takes modification times of all included templates - into account. - - @header is_uptodate(compile_params=None) - @return True if the template is uptodate, false otherwise. - - @param compile_params Only for internal use. - Do not use this optional parameter. It's intended only for - internal use by the TemplateManager. - """ - if not self._file: - self.DEB("TEMPLATE COMPILED FROM A STRING") - return 0 - - if self._version != __version__: - self.DEB("TEMPLATE: VERSION NOT UPTODATE") - return 0 - - if compile_params != None and compile_params != self._compile_params: - self.DEB("TEMPLATE: DIFFERENT COMPILATION PARAMS") - return 0 - - # Check modification times of the main template and all included - # templates. If the included template no longer exists, then - # the problem will be resolved when the template is recompiled. - - # Main template file. - if not (os.path.isfile(self._file) and \ - self._mtime == os.path.getmtime(self._file)): - self.DEB("TEMPLATE: NOT UPTODATE: " + self._file) - return 0 - - # Included templates. - for inc_file in self._include_mtimes.keys(): - if not (os.path.isfile(inc_file) and \ - self._include_mtimes[inc_file] == \ - os.path.getmtime(inc_file)): - self.DEB("TEMPLATE: NOT UPTODATE: " + inc_file) - return 0 - else: - self.DEB("TEMPLATE: UPTODATE") - return 1 - - def tokens(self): - """ Get tokens of this template. - @hidden - """ - return self._tokens - - def file(self): - """ Get filename of the main file of this template. - @hidden - """ - return self._file - - def debug(self, debug): - """ Get debugging state. - @hidden - """ - self._debug = debug - - ############################################## - # PRIVATE METHODS # - ############################################## - - def __getstate__(self): - """ Used by pickle when the class is serialized. - Remove the 'debug' attribute before serialization. - @hidden - """ - dict = copy.copy(self.__dict__) - del dict["_debug"] - return dict - - def __setstate__(self, dict): - """ Used by pickle when the class is unserialized. - Add the 'debug' attribute. - @hidden - """ - dict["_debug"] = 0 - self.__dict__ = dict - - - def DEB(self, str): - """ Print debugging message to stderr. - @hidden - """ - if self._debug: print >> sys.stderr, str - - -############################################## -# EXCEPTIONS # -############################################## - -class TemplateError(Exception): - """ Fatal exception. Raised on runtime or template syntax errors. - - This exception is raised when a runtime error occurs or when a syntax - error in the template is found. It has one parameter which always - is a string containing a description of the error. - - All potential IOError exceptions are handled by the module and are - converted to TemplateError exceptions. That means you should catch the - TemplateError exception if there is a possibility that for example - the template file will not be accesssible. - - The exception can be raised by constructors or by any method of any - class. - - The instance is no longer usable when this exception is raised. - """ - - def __init__(self, error): - """ Constructor. - @hidden - """ - Exception.__init__(self, "Htmltmpl error: " + error) - - -class PrecompiledError(Exception): - """ This exception is _PRIVATE_ and non fatal. - @hidden - """ - - def __init__(self, template): - """ Constructor. 
- @hidden - """ - Exception.__init__(self, template) - diff --git a/code/planet/sanitize.py b/code/planet/sanitize.py deleted file mode 100644 index c98b14de..00000000 --- a/code/planet/sanitize.py +++ /dev/null @@ -1,354 +0,0 @@ -""" -sanitize: bringing sanitiy to world of messed-up data -""" - -__author__ = ["Mark Pilgrim ", - "Aaron Swartz "] -__contributors__ = ["Sam Ruby "] -__license__ = "BSD" -__version__ = "0.25" - -_debug = 0 - -# If you want sanitize to automatically run HTML markup through HTML Tidy, set -# this to 1. Requires mxTidy -# or utidylib . -TIDY_MARKUP = 0 - -# List of Python interfaces for HTML Tidy, in order of preference. Only useful -# if TIDY_MARKUP = 1 -PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] - -import sgmllib, re - -# chardet library auto-detects character encodings -# Download from http://chardet.feedparser.org/ -try: - import chardet - if _debug: - import chardet.constants - chardet.constants._debug = 1 - - _chardet = lambda data: chardet.detect(data)['encoding'] -except: - chardet = None - _chardet = lambda data: None - -class _BaseHTMLProcessor(sgmllib.SGMLParser): - elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', - 'img', 'input', 'isindex', 'link', 'meta', 'param'] - - _r_barebang = re.compile(r'') - - def __init__(self, encoding): - self.encoding = encoding - if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding) - sgmllib.SGMLParser.__init__(self) - - def reset(self): - self.pieces = [] - sgmllib.SGMLParser.reset(self) - - def _shorttag_replace(self, match): - tag = match.group(1) - if tag in self.elements_no_end_tag: - return '<' + tag + ' />' - else: - return '<' + tag + '>' - - def feed(self, data): - data = self._r_barebang.sub(r'<!\1', data) - data = self._r_bareamp.sub("&", data) - data = self._r_shorttag.sub(self._shorttag_replace, data) - if self.encoding and type(data) == type(u''): - data = data.encode(self.encoding) - sgmllib.SGMLParser.feed(self, data) - - def normalize_attrs(self, attrs): - # utility method to be called by descendants - attrs = [(k.lower(), v) for k, v in attrs] - attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] - return attrs - - def unknown_starttag(self, tag, attrs): - # called for each start tag - # attrs is a list of (attr, value) tuples - # e.g. for
<pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
-        uattrs = []
-        # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
-        for key, value in attrs:
-            if type(value) != type(u''):
-                value = unicode(value, self.encoding)
-            uattrs.append((unicode(key, self.encoding), value))
-        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
-        if tag in self.elements_no_end_tag:
-            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
-        else:
-            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
-
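An aside on the deleted method above: the "breathtaking hack" exists only because Python 2's sgmllib hands attribute names and values back as byte strings. On Python 3, which this migration targets, str is already Unicode, so the unicode()/encode() round-trip collapses to a plain join. A minimal sketch for comparison (the function name and tag set here are ours, not part of the codebase):

VOID_TAGS = frozenset({"br", "hr", "img", "input", "link", "meta"})

def start_tag(tag: str, attrs: list[tuple[str, str]]) -> str:
    # Reserialize a start tag from parsed attributes; no encoding dance needed.
    strattrs = "".join(f' {key}="{value}"' for key, value in attrs)
    return f"<{tag}{strattrs} />" if tag in VOID_TAGS else f"<{tag}{strattrs}>"

assert start_tag("img", [("src", "logo.png")]) == '<img src="logo.png" />'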
-    def unknown_endtag(self, tag):
-        # called for each end tag, e.g. for 
, tag will be 'pre' - # Reconstruct the original end tag. - if tag not in self.elements_no_end_tag: - self.pieces.append("" % locals()) - - def handle_charref(self, ref): - # called for each character reference, e.g. for ' ', ref will be '160' - # Reconstruct the original character reference. - self.pieces.append('&#%(ref)s;' % locals()) - - def handle_entityref(self, ref): - # called for each entity reference, e.g. for '©', ref will be 'copy' - # Reconstruct the original entity reference. - self.pieces.append('&%(ref)s;' % locals()) - - def handle_data(self, text): - # called for each block of plain text, i.e. outside of any tag and - # not containing any character or entity references - # Store the original text verbatim. - if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text) - self.pieces.append(text) - - def handle_comment(self, text): - # called for each HTML comment, e.g. - # Reconstruct the original comment. - self.pieces.append('' % locals()) - - def handle_pi(self, text): - # called for each processing instruction, e.g. - # Reconstruct original processing instruction. - self.pieces.append('' % locals()) - - def handle_decl(self, text): - # called for the DOCTYPE, if present, e.g. - # - # Reconstruct original DOCTYPE - self.pieces.append('' % locals()) - - _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match - def _scan_name(self, i, declstartpos): - rawdata = self.rawdata - n = len(rawdata) - if i == n: - return None, -1 - m = self._new_declname_match(rawdata, i) - if m: - s = m.group() - name = s.strip() - if (i + len(s)) == n: - return None, -1 # end of buffer - return name.lower(), m.end() - else: - self.handle_data(rawdata) -# self.updatepos(declstartpos, i) - return None, -1 - - def output(self): - '''Return processed HTML as a single string''' - return ''.join([str(p) for p in self.pieces]) - -class _HTMLSanitizer(_BaseHTMLProcessor): - acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big', - 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col', - 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset', - 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', - 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup', - 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike', - 'strong', 'sub', 'sup', 'table', 'textarea', 'tbody', 'td', 'tfoot', 'th', - 'thead', 'tr', 'tt', 'u', 'ul', 'var'] - - acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', - 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', - 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols', - 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', - 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', - 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method', - 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', - 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', - 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type', - 'usemap', 'valign', 'value', 'vspace', 'width'] - - ignorable_elements = ['script', 'applet', 'style'] - - def reset(self): - _BaseHTMLProcessor.reset(self) - self.tag_stack = [] - self.ignore_level = 0 - - def feed(self, data): - _BaseHTMLProcessor.feed(self, data) - while self.tag_stack: - _BaseHTMLProcessor.unknown_endtag(self, 
self.tag_stack.pop()) - - def unknown_starttag(self, tag, attrs): - if tag in self.ignorable_elements: - self.ignore_level += 1 - return - - if self.ignore_level: - return - - if tag in self.acceptable_elements: - attrs = self.normalize_attrs(attrs) - attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes] - if tag not in self.elements_no_end_tag: - self.tag_stack.append(tag) - _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) - - def unknown_endtag(self, tag): - if tag in self.ignorable_elements: - self.ignore_level -= 1 - return - - if self.ignore_level: - return - - if tag in self.acceptable_elements and tag not in self.elements_no_end_tag: - match = False - while self.tag_stack: - top = self.tag_stack.pop() - if top == tag: - match = True - break - _BaseHTMLProcessor.unknown_endtag(self, top) - - if match: - _BaseHTMLProcessor.unknown_endtag(self, tag) - - def handle_pi(self, text): - pass - - def handle_decl(self, text): - pass - - def handle_data(self, text): - if not self.ignore_level: - text = text.replace('<', '') - _BaseHTMLProcessor.handle_data(self, text) - -def HTML(htmlSource, encoding='utf8'): - p = _HTMLSanitizer(encoding) - p.feed(htmlSource) - data = p.output() - if TIDY_MARKUP: - # loop through list of preferred Tidy interfaces looking for one that's installed, - # then set up a common _tidy function to wrap the interface-specific API. - _tidy = None - for tidy_interface in PREFERRED_TIDY_INTERFACES: - try: - if tidy_interface == "uTidy": - from tidy import parseString as _utidy - def _tidy(data, **kwargs): - return str(_utidy(data, **kwargs)) - break - elif tidy_interface == "mxTidy": - from mx.Tidy import Tidy as _mxtidy - def _tidy(data, **kwargs): - nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs) - return data - break - except: - pass - if _tidy: - utf8 = type(data) == type(u'') - if utf8: - data = data.encode('utf-8') - data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8") - if utf8: - data = unicode(data, 'utf-8') - if data.count(''): - data = data.split('>', 1)[1] - if data.count(' - - - " /> + {{ name }} + {{ link | e }} + - + {% for channel in Channels %} - + {{ channel.name }} - "> - + + {{ channel.title }} - " /> + - + {% endfor %} diff --git a/code/python/index.html.tmpl b/code/python/index.html.tmpl index c254ed8b..d8522761 100644 --- a/code/python/index.html.tmpl +++ b/code/python/index.html.tmpl @@ -1,87 +1,49 @@ -### Planet HTML template. -### -### This is intended to demonstrate and document Planet's templating -### facilities, and at the same time provide a good base for you to -### modify into your own design. -### -### The output's a bit boring though, if you're after less documentation -### and more instant gratification, there's an example with a much -### prettier output in the fancy-examples/ directory of the Planet source. - -### Lines like this are comments, and are automatically removed by the -### templating engine before processing. - - -### Planet makes a large number of variables available for your templates. -### See INSTALL for the complete list. The raw value can be placed in your -### output file using . We'll put the name of our -### Planet in the page title and again in an h1. - -<TMPL_VAR name> +{{ name }} - + -

- -### One of the two loops available is the Channels loop. This allows you -### to easily create a list of subscriptions, which is exactly what we'll do -### here. - -### Note that we can also expand variables inside HTML tags, but we need -### to be cautious and HTML-escape any illegal characters using the form -### +

{{ name }}

Subscriptions

-### The other loop is the Items loop, which will get iterated for each -### news item. - - - -### Visually distinguish articles from different days by checking for -### the new_date flag. This demonstrates the ... -### check. - - -

-
- -### Group consecutive articles by the same author together by checking -### for the new_channel flag. +{% for item in Items %} - -

" title="">

-
+{% if item.new_date %} +

{{ item.new_date }}

+{% endif %} +{% if item.new_channel %} +

{{ item.channel_name }}

+{% endif %} - -

">

-
+{% if item.title %} +

{{ item.title }}

+{% endif %}

- +{{ item.content }}

-">by at +{% if item.creator %}by {{ item.creator }} at {% endif %}{{ item.date }}

-
+{% endfor %}

Powered by Planet!
-Last updated: +Last updated: {{ date }}
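The template conversions in this patch swap htmltmpl's <TMPL_VAR>/<TMPL_IF>/<TMPL_LOOP> statements for Jinja2 (pinned as jinja2>=3.1.4 in the new pyproject.toml later in this diff). A minimal rendering sketch for the converted index template above; the context keys mirror the variables the template uses, but the wiring is an assumption, not Planet's actual driver code:

from jinja2 import Environment, FileSystemLoader

# autoescape stays off because the templates escape explicitly with "| e".
env = Environment(loader=FileSystemLoader("code/python"), autoescape=False)
template = env.get_template("index.html.tmpl")
print(template.render(
    name="Test Planet",                      # [Planet] name from config.ini
    link="https://planetpython.org/",
    date="October 21, 2021 04:29 PM UTC",
    Channels=[],                             # sidebar subscription list
    Items=[{
        "new_date": "October 21, 2021",      # set only when the day changes
        "new_channel": True,                 # set only when the feed changes
        "channel_name": "example rss",
        "title": "Example Item 1",
        "link": "https://example.com/item1",
        "content": "This is a description of item 1",
        "creator": "John Doe",
        "date": "October 21, 2021 04:29 PM UTC",
    }],
))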

diff --git a/code/python/opml.xml.tmpl b/code/python/opml.xml.tmpl index 50bbabe3..aacaf96d 100644 --- a/code/python/opml.xml.tmpl +++ b/code/python/opml.xml.tmpl @@ -1,16 +1,16 @@ - <TMPL_VAR name> - - - - + {{ name }} + {{ date_822 }} + {{ date_822 }} + {{ owner_name }} + {{ owner_email }} - + - - " xmlUrl=""/> - + {% for channel in Channels %} + + {% endfor %} diff --git a/code/python/rss10.xml.tmpl b/code/python/rss10.xml.tmpl index ea0c2e92..f753bc7e 100644 --- a/code/python/rss10.xml.tmpl +++ b/code/python/rss10.xml.tmpl @@ -6,32 +6,32 @@ xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns="http://purl.org/rss/1.0/" > -"> - <TMPL_VAR name> - - - + + {{ name }} + {{ link | e }} + {{ name }} - {{ link | e }} - - " /> - + {% for item in Items %} + + {% endfor %} - -"> - <TMPL_VAR channel_name><TMPL_IF title>: <TMPL_VAR title></TMPL_IF> - - - - - - - - +{% for item in Items %} + + {{ item.channel_name }}{% if item.title %}: {{ item.title }}{% endif %} + {{ item.link | e }} + {% if item.content %} + {{ item.content | e }} + {% endif %} + {{ item.date_iso }} + {% if item.creator %} + {{ item.creator }} + {% endif %} - +{% endfor %} diff --git a/code/python/rss20.xml.tmpl b/code/python/rss20.xml.tmpl index 379e03fb..a4b54a9c 100644 --- a/code/python/rss20.xml.tmpl +++ b/code/python/rss20.xml.tmpl @@ -2,25 +2,25 @@ - <TMPL_VAR name> - + {{ name }} + {{ link | e }} en - - + {{ name }} - {{ link | e }} - - - <TMPL_VAR channel_name><TMPL_IF title>: <TMPL_VAR title></TMPL_IF> - - - - - - - - - - - + {% for item in Items %} + + {{ item.channel_name }}{% if item.title %}: {{ item.title }}{% endif %} + {{ item.id | e }} + {{ item.link | e }} + {% if item.content %} + {{ item.content | e }} + {% endif %} + {{ item.date_822 }} + {% if item.creator %} + {{ item.creator }} + {% endif %} + + {% endfor %} diff --git a/config/foafroll.xml.tmpl b/config/foafroll.xml.tmpl index f3447383..cf1506bb 100644 --- a/config/foafroll.xml.tmpl +++ b/config/foafroll.xml.tmpl @@ -7,25 +7,25 @@ xmlns:dc="http://purl.org/dc/elements/1.1/" > - - - " /> + {{ name }} + {{ link | e }} + - + {% for channel in Channels %} - + {{ channel.name }} - "> - + + {{ channel.title }} - " /> + - + {% endfor %} diff --git a/config/index.html.tmpl b/config/index.html.tmpl index 965852e8..7cf67244 100644 --- a/config/index.html.tmpl +++ b/config/index.html.tmpl @@ -2,7 +2,7 @@ - <TMPL_VAR name> + {{ name }} @@ -10,7 +10,7 @@ - + @@ -32,8 +32,7 @@

- +

skip to navigation
@@ -42,38 +41,30 @@ src="/static/images/python-logo.gif" alt="homepage" border="0" />
-

+

{{ name }}

-

Last update: +

Last update: {{ date }}

- +{% for item in Items %} -### Visually distinguish articles from different days by checking for -### the new_date flag. This demonstrates the ... -### check. +{% if item.new_date %} +

{{ item.new_date }}

+{% endif %} - -

-
+{% if item.new_channel %} +

{{ item.channel_name }}

+{% endif %} -### Group consecutive articles by the same author together by checking -### for the new_channel flag. - - -

" title="">

-
- - - -

">

-
+{% if item.title %} +

{{ item.title }}

+{% endif %}

- +{{ item.content }}

-">by at +{% if item.creator %}by {{ item.creator }} at {% endif %}{{ item.date }}

-
+{% endfor %}
@@ -127,10 +118,10 @@ src="/static/images/python-logo.gif" alt="homepage" border="0" />
  • Subscriptions
    • [OPML feed]
    • - -
    • " title=""> +{% for channel in Channels %} +
    • {{ channel.name }}
    • -
      +{% endfor %}
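Wherever these templates interpolate a URL or title into markup, they apply Jinja2's e filter ({{ link | e }}, {{ channel.url | e }}, and so on). The filter is backed by markupsafe, which the new dependency set pulls in via Jinja2; a quick illustration with a made-up URL:

from markupsafe import escape

# "| e" in the templates expands to exactly this call.
print(escape('https://example.com/feed?a=1&q="planet"'))
# https://example.com/feed?a=1&amp;q=&#34;planet&#34;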
    • diff --git a/config/opml.xml.tmpl b/config/opml.xml.tmpl index aaa88338..f940d845 100644 --- a/config/opml.xml.tmpl +++ b/config/opml.xml.tmpl @@ -1,16 +1,15 @@ - - <TMPL_VAR name> - - - - - - - - - " xmlUrl=""/> - + + {{ name }} + {{ date_822 }} + {{ date_822 }} + {{ owner_name }} + {{ owner_email }} + + + {% for channel in Channels %} + + {% endfor %} diff --git a/config/rss10.xml.tmpl b/config/rss10.xml.tmpl index 00f8ca98..1dfdc174 100644 --- a/config/rss10.xml.tmpl +++ b/config/rss10.xml.tmpl @@ -6,32 +6,32 @@ xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns="http://purl.org/rss/1.0/" > -"> - <TMPL_VAR name ESCAPE="HTML"> - - - + + {{ name | e }} + {{ link | e }} + {{ name }} - {{ link | e }} - - " /> - + {% for item in Items %} + + {% endfor %} - -"> - <TMPL_VAR channel_name ESCAPE="HTML"><TMPL_IF title>: <TMPL_VAR title ESCAPE="HTML"></TMPL_IF> - - - - - - - - +{% for item in Items %} + + {{ item.channel_name | e }}{% if item.title %}: {{ item.title | e }}{% endif %} + {{ item.link | e }} + {% if item.content %} + {{ item.content | e }} + {% endif %} + {{ item.date_iso }} + {% if item.creator %} + {{ item.creator | e }} + {% endif %} - +{% endfor %} diff --git a/config/rss20.xml.tmpl b/config/rss20.xml.tmpl index 1026dc10..5373a059 100644 --- a/config/rss20.xml.tmpl +++ b/config/rss20.xml.tmpl @@ -1,26 +1,25 @@ - - <TMPL_VAR name ESCAPE="HTML"> - + {{ name | e }} + {{ link | e }} en - - + {{ name }} - {{ link | e }} - - - <TMPL_VAR channel_name ESCAPE="HTML"><TMPL_IF title>: <TMPL_VAR title ESCAPE="HTML"></TMPL_IF> - - - - - - - - - - - + {% for item in Items %} + + {{ item.channel_name | e }}{% if item.title %}: {{ item.title | e }}{% endif%} + {{ item.link | e }} + {{ item.link | e }} + {% if item.content %} + {{ item.content | e }} + {% endif %} + {{ item.date_822 }} + {% if item.creator %} + {{ item.creator | e }} + {% endif %} + + {% endfor %} diff --git a/config/sort-ini.py b/config/sort-ini.py index 98976d3b..b3fba13d 100755 --- a/config/sort-ini.py +++ b/config/sort-ini.py @@ -1,41 +1,40 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys -import ConfigParser +from configparser import DEFAULTSECT, RawConfigParser if len(sys.argv) > 1: filename = sys.argv[1] else: - filename = 'config.ini' - -oconfig = ConfigParser.RawConfigParser() + filename = "config.ini" + +oconfig = RawConfigParser() oconfig.read(filename) # This part will destroy the configuration if there's a crash while # writing the output. We're in an GIT-controlled directory, so # I didn't care enough to fix this. 
-with open(filename, 'wb') as fd: +with open(filename, "wb") as fd: # Copy of write() code that sorts output by section if oconfig._defaults: fd.write("[%s]\n" % DEFAULTSECT) - for (key, value) in oconfig._defaults.items(): - fd.write("%s = %s\n" % (key, str(value).replace('\n', '\n\t'))) + for key, value in oconfig._defaults.items(): + fd.write("{} = {}\n".format(key, str(value).replace("\n", "\n\t"))) fd.write("\n") - + result = {} for section in sorted(oconfig._sections): - if section == 'Planet': + if section == "Planet": fd.write("[%s]\n" % section) - for (key, value) in oconfig._sections[section].items(): + for key, value in oconfig._sections[section].items(): if key != "__name__": - if section == 'Planet': - fd.write("%s = %s\n" % - (key, str(value).replace('\n', '\n\t'))) + if section == "Planet": + fd.write("{} = {}\n".format(key, str(value).replace("\n", "\n\t"))) else: - result[value.replace('"', '')] = section - if section == 'Planet': + result[value.replace('"', "")] = section + if section == "Planet": fd.write("\n") - + for key, value in sorted(result.items()): fd.write("[%s]\n" % value) name = key @@ -43,4 +42,3 @@ name = '"%s"' % key fd.write("name = %s\n" % name) fd.write("\n") - diff --git a/config/summary.html.tmpl b/config/summary.html.tmpl index 844340dc..a7789b94 100644 --- a/config/summary.html.tmpl +++ b/config/summary.html.tmpl @@ -2,7 +2,7 @@ - <TMPL_VAR name> + {{ name }} @@ -10,7 +10,7 @@ - + @@ -20,8 +20,7 @@

      - +

      skip to navigation
      @@ -30,38 +29,30 @@ src="/static/images/python-logo.gif" alt="homepage" border="0" />
      -

      +

      {{ name }}

      -

      Last update: +

      Last update: {{ date }}

      - +{% for item in Items %} -### Visually distinguish articles from different days by checking for -### the new_date flag. This demonstrates the ... -### check. +{% if item.new_date %} +

      {{ item.new_date }}

      +{% endif %} - -

      -
      +{% if item.new_channel %} +

      {{ item.channel_name }}

      +{% endif %} -### Group consecutive articles by the same author together by checking -### for the new_channel flag. - - -

      " title="">

      -
      - - - -

      ">

      -
      +{% if item.title %} +

      {{ item.title }}

      +{% endif %}

      - +{{ item.summary }}

      -">by at +{% if item.creator %}by {{ item.creator }} at {% endif %}{{ item.date }}

      -
      +{% endfor %}
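summary.html.tmpl is the one page that emits {{ item.summary }} instead of {{ item.content }}. For a feed that only carries a plain RSS <description>, feedparser surfaces the same text under both names, which the new tests at the bottom of this diff assert; a quick check against the fixture added there:

import feedparser

# sample_rss.xml is the fixture introduced later in this patch.
feed = feedparser.parse("tests/fixtures/sample_rss.xml")
print(feed.entries[0].summary)  # This is a description of item 1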
      @@ -98,10 +89,10 @@ src="/static/images/python-logo.gif" alt="homepage" border="0" />
  • Subscriptions
  • diff --git a/config/titles_only.html.tmpl b/config/titles_only.html.tmpl index c44a104b..51f88193 100644 --- a/config/titles_only.html.tmpl +++ b/config/titles_only.html.tmpl @@ -2,7 +2,7 @@ - <TMPL_VAR name> + {{ name }} @@ -10,19 +10,16 @@ - - + + - + - +

    - +

    skip to navigation
    @@ -31,35 +28,27 @@ src="/static/images/python-logo.gif" alt="homepage" border="0" />
    -

    +

    {{ name }}

    -

    Last update: +

    Last update: {{ date }}

    - +{% for item in Items %} -### Visually distinguish articles from different days by checking for -### the new_date flag. This demonstrates the ... -### check. +{% if item.new_date %} +

    {{ item.new_date }}

    +{% endif %} - -

    -
    +{% if item.new_channel %} +

    {{ item.channel_name }}

    +{% endif %} -### Group consecutive articles by the same author together by checking -### for the new_channel flag. - - -

    " title="">

    -
    - - - -

    ">

    -
    +{% if item.title %} +

    {{ item.title }}

    +{% endif %}

    -">by at +{% if item.creator %}by {{ item.creator }} at {% endif %}{{ item.date }}

    -
    +{% endfor %}
    @@ -86,7 +75,7 @@ src="/static/images/python-logo.gif" alt="homepage" border="0" />
  • OLPC
  • PySoy
  • SciPy
  • -
  • SymPy
  • +
  • SymPy
  • Twisted
  • Python/Web Planets @@ -113,12 +102,12 @@ src="/static/images/python-logo.gif" alt="homepage" border="0" />
  • Subscriptions
    • [OPML feed]
    • - -
    • " title=""> -
    • -
      +{% for channel in Channels %} +
    • {{ channel.name }} +
    • +{% endfor %}
    • To request addition or removal:
      -e-mail planet at python.org (note, responses can take up to a few days)
    • + e-mail planet at python.org (note, responses can take up to a few days)
  • diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..5f19b0b6 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,49 @@ +[project] +name = "python-planet" +version = "3.0" +description = "PythonPlanet RSS/Atom feed aggregator" +readme = { file = "code/README.pydotorg", content-type="text/markdown" } + +requires-python = ">=3.12" +dependencies = [ + "feedparser>=6.0.11", + "jinja2>=3.1.4", +] + +[[authors]] +name = "Scott James Remnant" +email = "scott@netsplit.com" + +[[authors]] +name = "Jeff Waugh" +email = "jdub@perkypants.org" + +[[authors]] +name = "Jacob Coffee" +email = "jacob@z7x.org" + +[[authors]] +name = "Chris Rose" +email = "offline@offby1.net" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["code/planet"] + +[tool.uv] +dev-dependencies = [ + "pyright>=1.1.383", + "pytest-xdist>=3.6.1", + "pytest>=8.3.3", + "ruff>=0.6.9", +] + +[tool.pytest.ini_options] +looponfailroots = ["code", "tests"] +filterwarnings = [ + # I know looponfailroots is 'deprecated' but ... i'm tired of seeing it + "ignore::DeprecationWarning:xdist.plugin" +] diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..17116a4f --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,39 @@ +import configparser + +import planet as planet_module +import pytest + + +@pytest.fixture(name="config") +def get_config(tmp_path): + config = configparser.ConfigParser() + ini_text = f"""\ + [Planet] + name = Test Planet + output_dir = {tmp_path}/output + cache_directory = {tmp_path}/cache + + [https://example.com/rss] + name = example rss + + [https://example.com/atom] + name = example atom + """ + + config.read_string(ini_text) + return config + + +@pytest.fixture(name="planet") +def get_planet(config): + return planet_module.Planet(config) + + +@pytest.fixture(name="rss_channel") +def get_rss_channel(planet): + return planet_module.Channel(planet, "https://example.com/rss") + + +@pytest.fixture(name="atom_channel") +def get_atom_channel(planet): + return planet_module.Channel(planet, "https://example.com/atom") diff --git a/tests/fixtures/sample_atom.xml b/tests/fixtures/sample_atom.xml new file mode 100644 index 00000000..36d152d5 --- /dev/null +++ b/tests/fixtures/sample_atom.xml @@ -0,0 +1,25 @@ + + + Example Atom Feed + + 2021-10-21T16:29:00Z + + John Doe + author@example.com + + https://example.com/atom + + Example Entry 1 + + https://example.com/entry1 + 2021-10-21T16:31:00Z + This is a summary of entry 1 + + + Example Entry 2 + + https://example.com/entry2 + 2021-10-22T16:31:00Z + This is a summary of entry 2 + + diff --git a/tests/fixtures/sample_rss.xml b/tests/fixtures/sample_rss.xml new file mode 100644 index 00000000..cc9afbdd --- /dev/null +++ b/tests/fixtures/sample_rss.xml @@ -0,0 +1,24 @@ + + + + Example RSS Feed + https://example.com/rss + This is an example of an RSS feed + + Example Item 1 + https://example.com/item1 + This is a description of item 1 + author@example.com (John Doe) + Thu, 21 Oct 2021 16:29:00 +0000 + https://example.com/item1 + + + Example Item 2 + https://example.com/item2 + This is a description of item 2 + author@example.com (John Doe) + Thu, 22 Oct 2021 16:29:00 +0000 + https://example.com/item2 + + + diff --git a/tests/test_newsitem.py b/tests/test_newsitem.py new file mode 100644 index 00000000..f89e5403 --- /dev/null +++ b/tests/test_newsitem.py @@ -0,0 +1,189 @@ +import configparser +import time +from pathlib import Path +from pprint import pprint + 
+import feedparser +import planet +import pytest +from planet.cache import utf8 + +# Ensure the `tests/fixtures/` directory exists and feeds are stored there. +FIXTURES_DIR = Path(__file__).parent / "fixtures" + + +@pytest.fixture(name="channel_cache") +def channel_cache(rss_channel): + try: + yield rss_channel._cache + finally: + pprint(dict(rss_channel._cache)) + + +@pytest.fixture(scope="module", name="rss_feed") +def load_rss_feed(): + """Load and parse the sample RSS feed fixture.""" + with open(FIXTURES_DIR / "sample_rss.xml", encoding="utf-8") as rss_file: + feed_data = rss_file.read() + return feedparser.parse(feed_data) + + +@pytest.fixture(scope="module", name="atom_feed") +def load_atom_feed(): + """Load and parse the sample Atom feed fixture.""" + with open(FIXTURES_DIR / "sample_atom.xml", encoding="utf-8") as atom_file: + feed_data = atom_file.read() + return feedparser.parse(feed_data) + + +def test_newsitem_from_rss(rss_feed, rss_channel): + """Test that we can create a NewsItem from an RSS feed item.""" + item = rss_feed.entries[0] + newsitem = planet.NewsItem(rss_channel, rss_feed.entries[0]["id"]) + newsitem.update(item) + assert newsitem.title == "Example Item 1" + assert newsitem.link == "https://example.com/item1" + assert newsitem.date[0] == 2021 + assert newsitem.author == "author@example.com (John Doe)" + assert newsitem.content == "This is a description of item 1" + assert newsitem.summary == "This is a description of item 1" + + +def test_newsitem_from_atom(atom_feed, atom_channel): + """Test that we can create a NewsItem from an RSS feed item.""" + item = atom_feed.entries[0] + newsitem = planet.NewsItem(atom_channel, atom_feed.entries[0]["id"]) + newsitem.update(item) + assert newsitem.title == "Example Entry 1" + assert newsitem.link == "https://example.com/entry1" + # parse the iso timestamp into a time tuple + assert newsitem.date[0] == 2021 + assert newsitem.content == "This is a summary of entry 1" + assert newsitem.summary == "This is a summary of entry 1" + + +def test_caching_newsitem(rss_feed, rss_channel): + """Test that we can create a NewsItem from an RSS feed item.""" + item = rss_feed.entries[0] + newsitem = planet.NewsItem(rss_channel, rss_feed.entries[0]["id"]) + newsitem.update(item) + newsitem.cache_write() + + # now try read the newsitem, but with the cache; we should be able to + # get the values before updating + newsitem = planet.NewsItem(rss_channel, rss_feed.entries[0]["id"]) + assert newsitem.title == "Example Item 1" + assert newsitem.link == "https://example.com/item1" + assert newsitem.date[0] == 2021 + assert newsitem.author == "author@example.com (John Doe)" + assert newsitem.content == "This is a description of item 1" + assert newsitem.summary == "This is a description of item 1" + + +# These tests are aimed at testing the specifications of the cache; we are looking at key structures +# and internals, so that we can have some sense of implementation consistency. 
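For orientation, the shelve layout these tests pin down looks like the sketch below. The key pattern "<item id> <field>" plus a parallel "<item id> <field> type" entry is read off the assertions that follow, not from a published spec of planet.cache:

item_id = "https://example.com/item1"
expected_cache = {
    f"{item_id} title": "Example Item 1",
    f"{item_id} title type": "string",
    # dates are flattened to a space-joined nine-field time tuple
    f"{item_id} updated": "2021 10 21 16 29 0 3 294 0",
    f"{item_id} updated type": "date",
}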
+ + +@pytest.fixture(name="news_item") +def news_item( + rss_channel, + rss_feed, +): + return planet.NewsItem(rss_channel, rss_feed.entries[0]["id"]) + + +@pytest.fixture(name="sample_entry") +def sample_entry(rss_feed): + return rss_feed.entries[0] + + +def test_cache_write_and_read(news_item, sample_entry, channel_cache): + # First, update the news item using the sample_entry + news_item.update(sample_entry) + news_item.cache_write(sync=True) + + # Now, inspect the cache to see if keys have been stored correctly + assert f"{news_item.id} title" in channel_cache + assert f"{news_item.id} link" in channel_cache + assert channel_cache[f"{news_item.id} title"] == utf8(sample_entry["title"]) + assert channel_cache[f"{news_item.id} link"] == utf8(sample_entry["link"]) + + # Date value stored as a string representation of the time tuple + assert f"{news_item.id} updated" in channel_cache + assert channel_cache[f"{news_item.id} updated"] == " ".join( + map(str, sample_entry["updated_parsed"]) + ) + + +def test_cache_clear(news_item, sample_entry, channel_cache): + # Update and save to cache + news_item.update(sample_entry) + news_item.cache_write(sync=True) + + # Ensure keys are there + assert f"{news_item.id} title" in channel_cache + + # Now clear the cache for the news_item + news_item.cache_clear(sync=True) + + # Ensure keys are removed from the cache + assert f"{news_item.id} title" not in channel_cache + assert f"{news_item.id} link" not in channel_cache + assert f"{news_item.id} updated" not in channel_cache + + +def test_cache_key_type(news_item, sample_entry, channel_cache): + # Update and save to cache + news_item.update(sample_entry) + news_item.cache_write(sync=True) + + # Ensure keys and types are correct + assert channel_cache[f"{news_item.id} title"] == "Example Item 1" + assert channel_cache[f"{news_item.id} title type"] == "string" + assert channel_cache[f"{news_item.id} updated type"] == "date" + + +def test_cache_reload(news_item, sample_entry, rss_channel): + # Update and save to cache + news_item.update(sample_entry) + news_item.cache_write(sync=True) + + # Create a new NewsItem instance with the same cache, and reload + new_item = planet.NewsItem(rss_channel, f"{news_item.id}") + new_item.cache_read() + + # Check that the data is retrieved as expected + assert new_item.get("title") == "Example Item 1" + assert new_item.get("link") == "https://example.com/item1" + assert new_item.get("date") == sample_entry["date_parsed"] + + +def test_cache_date_field(news_item, sample_entry, rss_channel, channel_cache): + # Ensure the date field gets cached properly + news_item.update(sample_entry) + news_item.cache_write(sync=True) + + # Check that the date type is correctly saved as dates + assert f"{news_item.id} updated" in channel_cache + assert f"{news_item.id} updated type" in channel_cache + assert channel_cache[f"{news_item.id} updated type"] == "date" + + # Reload item and ensure the date value is parsed correctly + new_item = planet.NewsItem(rss_channel, f"{news_item.id}") + new_item.cache_read() + + # Verify that the date field is properly restored as date tuple + assert new_item.get("date") == sample_entry["date_parsed"] + + +def test_delete_key_from_cache(news_item, sample_entry, channel_cache): + # Update and save to cache + news_item.update(sample_entry) + news_item.cache_write(sync=True) + + # Delete 'title' key using NewsItem's del_key method + news_item.del_key("title") + news_item.cache_write(sync=True) + + # Ensure 'title' key is deleted from cache + assert 
f"{news_item.id} title" not in channel_cache