Commit 87612cc0 authored by Ana Guerrero López's avatar Ana Guerrero López

Import Upstream version 3.8

parent be234947
......@@ -24,21 +24,18 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
from __future__ import generators
import os, sys, getopt, time, re
import os, sys, getopt, time, re, warnings
from gzip import GzipFile
from types import UnicodeType
from sqlobject import *
from sqlobject.sqlbuilder import ISNOTNULL
from imdb.parser.sql.dbschema import *
from imdb.parser.sql import soundex, get_movie_data
from imdb.parser.sql.dbschema import *
from imdb.utils import analyze_title, analyze_name, \
build_name, build_title, normalizeName, _articles, \
build_company_name, analyze_company_name
from imdb.parser.local.movieParser import _bus, _ldk, _lit, _links_sect
from imdb.parser.local.personParser import _parseBiography
from imdb._exceptions import IMDbParserError
from imdb._exceptions import IMDbParserError, IMDbError
_articles = list(_articles)
for i, art in enumerate(_articles):
......@@ -48,7 +45,7 @@ for i, art in enumerate(_articles):
re_nameImdbIndex = re.compile(r'\(([IVXLCDM]+)\)')
HELP = """imdbpy2sql.py usage:
%s -d /directory/with/PlainTextDataFiles/ -u URI [--COMPATIBILITY-OPTIONS]
%s -d /directory/with/PlainTextDataFiles/ -u URI [-o sqlobject,sqlalchemy] [--COMPATIBILITY-OPTIONS]
# NOTE: URI is something along the line:
scheme://[user[:password]@]host[:port]/database[?parameters]
......@@ -75,6 +72,10 @@ HELP = """imdbpy2sql.py usage:
IMDB_PTDF_DIR = None
# URI used to connect to the database.
URI = None
# ORM to use.
USE_ORM = None
#
DB_TABLES = []
# Max allowed recursion, inserting data.
MAX_RECURSION = 10
# Store custom queries specified on the command line.
......@@ -115,11 +116,11 @@ if '--sqlite-transactions' in sys.argv[1:]:
# Manage arguments list.
try:
optlist, args = getopt.getopt(sys.argv[1:], 'u:d:e:h',
optlist, args = getopt.getopt(sys.argv[1:], 'u:d:e:o:h',
['uri=', 'data=', 'execute=',
'mysql-innodb', 'ms-sqlserver',
'sqlite-transactions',
'mysql-force-myisam',
'mysql-force-myisam', 'orm',
'help'])
except getopt.error, e:
print 'Troubles with arguments.'
......@@ -151,6 +152,8 @@ for opt in optlist:
CUSTOM_QUERIES.setdefault(nw, []).append(cmd)
else:
CUSTOM_QUERIES.setdefault(when, []).append(cmd)
elif opt[0] in ('-o', '--orm'):
USE_ORM = opt[1].split(',')
elif opt[0] in ('-h', '--help'):
print HELP
sys.exit(0)
......@@ -210,8 +213,45 @@ if ('--mysql-force-myisam' in sys.argv[1:] and
"belong to the database server you're using: proceed at your\n"\
"own risk!\n"
if USE_ORM is None:
USE_ORM = ('sqlobject', 'sqlalchemy')
if not isinstance(USE_ORM, (tuple, list)):
USE_ORM = [USE_ORM]
nrMods = len(USE_ORM)
_gotError = False
for idx, mod in enumerate(USE_ORM):
mod = mod.lower()
try:
if mod == 'sqlalchemy':
from imdb.parser.sql.alchemyadapter import getDBTables, \
NotFoundError, setConnection, AND, OR, IN, ISNULL, \
ISNOTNULL, toUTF8
elif mod == 'sqlobject':
from imdb.parser.sql.objectadapter import getDBTables, \
NotFoundError, setConnection, AND, OR, IN, ISNULL, \
ISNOTNULL, toUTF8
else:
warnings.warn('unknown module "%s".' % mod)
continue
DB_TABLES = getDBTables()
for t in DB_TABLES:
globals()[t._imdbpyName] = t
if _gotError:
warnings.warn('falling back to "%s".' % mod)
break
except ImportError, e:
if idx+1 >= nrMods:
raise IMDbError, 'unable to use any ORM in %s: %s' % (
str(USE_ORM), str(e))
else:
warnings.warn('unable to use "%s": %s' % (mod, str(e)))
_gotError = True
continue
else:
raise IMDbError, 'unable to use any ORM in %s' % str(USE_ORM)
# Connect to the database.
conn = setConnection(URI)
conn = setConnection(URI, DB_TABLES)
# Extract exceptions to trap.
OperationalError = conn.module.OperationalError
IntegrityError = conn.module.IntegrityError
......@@ -222,7 +262,7 @@ CURS = connectObject.cursor()
# Name of the database and style of the parameters.
DB_NAME = conn.dbName
PARAM_STYLE = conn.module.paramstyle
PARAM_STYLE = conn.paramstyle
def tableName(table):
......@@ -1876,7 +1916,7 @@ def restoreImdbID(tons, cls):
mop_in_db.imdbID = t['imdbID']
except:
continue
except SQLObjectNotFound:
except NotFoundError:
continue
count += 1
print 'DONE! (restored %d entries out of %d)' % (count, len(tons))
......@@ -1948,7 +1988,7 @@ def run():
# Truncate the current database.
print 'DROPPING current database...',
sys.stdout.flush()
dropTables()
dropTables(DB_TABLES)
print 'DONE!'
executeCustomQueries('BEFORE_CREATE')
......@@ -1956,7 +1996,7 @@ def run():
# Rebuild the database structure.
print 'CREATING new tables...',
sys.stdout.flush()
createTables()
createTables(DB_TABLES)
print 'DONE!'
t('dropping and recreating the database')
......@@ -2058,7 +2098,7 @@ def run():
print 'building database indexes (this may take a while)'
sys.stdout.flush()
# Build database indexes.
createIndexes()
createIndexes(DB_TABLES)
t('createIndexes()')
executeCustomQueries('END')
......
......@@ -87,7 +87,9 @@ def doMPAA():
line = mpaaF.readline()
mpaaF.close()
offsetList.sort()
print 'Creating the mpaa-ratings-reasons.index file...',
idxF = open(os.path.join(LOCAL_DATA_DIR, 'mpaa-ratings-reasons.index'),'wb')
print 'DONE!'
idxF.writelines('%s%s' % (toBin3(movieID), toBin3(ftell))
for movieID, ftell in offsetList)
idxF.close()
......@@ -99,7 +101,9 @@ mpaaFileOut = open(os.path.join(LOCAL_DATA_DIR,
'mpaa-ratings-reasons.data'), 'w')
for aLine in mpaaFileGZ:
print 'Creating the mpaa-ratings-reasons.data file...',
mpaaFileOut.write(aLine)
print 'DONE!'
mpaaFileOut.close()
mpaaFileGZ.close()
......
......@@ -47,8 +47,8 @@ print 'characterID\t: imdbID : name'
# Print the long imdb name for every character.
for character in results:
print '%s\t\t: %s : %s' % (character.characterID,
i.get_imdbID(character),
character['long imdb name'].encode(out_encoding, 'replace'))
outp = u'%s\t\t: %s : %s' % (character.characterID, i.get_imdbID(character),
character['long imdb name'])
print outp.encode(out_encoding, 'replace')
......@@ -47,8 +47,8 @@ print 'companyID\t: imdbID : name'
# Print the long imdb name for every company.
for company in results:
print '%s\t\t: %s : %s' % (company.companyID,
i.get_imdbID(company),
company['long imdb name'].encode(out_encoding, 'replace'))
outp = u'%s\t\t: %s : %s' % (company.companyID, i.get_imdbID(company),
company['long imdb name'])
print outp.encode(out_encoding, 'replace')
......@@ -47,8 +47,8 @@ print 'movieID\t: imdbID : title'
# Print the long imdb title for every movie.
for movie in results:
print '%s\t: %s : %s' % (movie.movieID,
i.get_imdbID(movie),
movie['long imdb title'].encode(out_encoding, 'replace'))
outp = u'%s\t: %s : %s' % (movie.movieID, i.get_imdbID(movie),
movie['long imdb title'])
print outp.encode(out_encoding, 'replace')
......@@ -47,8 +47,8 @@ print 'personID\t: imdbID : name'
# Print the long imdb name for every person.
for person in results:
print '%s\t: %s : %s' % (person.personID,
i.get_imdbID(person),
person['long imdb name'].encode(out_encoding, 'replace'))
outp = u'%s\t: %s : %s' % (person.personID, i.get_imdbID(person),
person['long imdb name'])
print outp.encode(out_encoding, 'replace')
......@@ -13,8 +13,16 @@ I'd like to thank the following people for their help:
* H. Turgut Uyar for a number of bug reports and a lot of work on
the test-suite.
* Rdian06 for a patch for movies without plot authors.
* Jesper Nøhr for a lot of testing, especially on 'sql'.
* James Rubino for many bug reports.
* Alen Ribic for some bug reports and hints.
* Helio MC Pereira for a bug report about unicode.
* Michael Charclo for some bug reports performing 'http' queries.
* Amit Belani for bug reports about plot outline and other changes.
......
Changelog for IMDbPY
====================
* What's new in release 3.8 "Quattro Carogne a Malopasso" (03 Nov 2008)
[http]
- fixed search system for direct hits.
- fixed IDs so that they always are str and not unicode.
- fixed a bug about plot without authors.
- for pages about a single episode of a series, "Series Crew" are
now separated items.
- introduced the preprocess_dom method of the DOMParserBase class.
- handling rowspan for DOMHTMLAwardsParser is no more a special case.
- first changes to remove old parsers.
[sql]
- introduced support for SQLAlchemy.
[mobile]
- fixed multiple 'nick names'.
- added 'aspect ratio'.
- fixed a "direct hit" bug searching for people.
[global]
- fixed search_* example scripts.
- updated the documentation.
* What's new in release 3.7 "Burn After Reading" (22 Sep 2008)
[http]
- introduced a new set of parsers, active by default, based on DOM/XPath.
......
......@@ -2,6 +2,14 @@ IMDbPY
NOTE: see also the recommendations in the "DISCLAIMER.txt" file.
NOTE: for a list of other persons who share with me the copyright over
specific portions of code, see the "CONTRIBUTORS.txt" file.
NOTE: IMDbPY includes an unmodified version of BeautifulSoup,
renamed _bsoup.py; that code is copyrighted by its author,
Leonard Richardson <leonardr at segfault.org> and is released
under a New-style BSD license.
Copyright 2004-2008 Davide Alberani <da@erlug.linux.it>
......
......@@ -54,6 +54,8 @@ imdb (package)
+-> sql (package)
| |
| +-> dbschema
| +-> alchemyadapter
| +-> objectadapter
|
+-> common (package)
|
......@@ -108,7 +110,9 @@ of the database.
The parser.sql package manages the access to the data in the SQL
database, created with the imdbpy2sql.py script; see the README.sqldb file.
The dbschema module contains tables definitions and some useful functions.
The dbschema module contains tables definitions and some useful functions;
The alchemyadapter adapts the SQLAlchemy ORM to the internal mechanisms
of IMDbPY, and the objectadapter does the same for the SQLObject ORM.
The class in the parser.mobile package is a subclass of the one found
in parser.http, with some method overridden to be many times faster (from
......
......@@ -28,9 +28,9 @@ Preventing the installation of the parser.local package will also save
at least 120Kb of disk space on your system.
The DO_SQL variable, if set to 0, will excludes the parser.sql
package; you don't need it if your system does not have the
SQLObject module and you don't want to store the whole IMDb's plain
text database files in a SQL database.
package; you don't need it if your system does not have any of the
SQLObject or SQLAlchemy packages and/or you don't want to store the
whole IMDb's plain text database files in a SQL database.
If both DO_LOCAL and DO_SQL are set to 0, the parser.common package
is not installed, and the "cutils" C module is not compiled.
......
......@@ -7,7 +7,7 @@ were finite-states machines, being SGMLParser a SAX parser) to
a set of parsers based on the libxml2 library or on the BeautifulSoup
module (and so, using a DOM/XPath-based approach).
The idea and the implementation of these new parsers is mostly a
work of H. Turgut Uyar, and can brings to parsers shorter, easier
work of H. Turgut Uyar, and can bring to parsers shorter, easier
to write and maybe even faster.
......@@ -67,6 +67,18 @@ parameter to True. E.g.:
ia = IMDb('http', oldParsers=True)
...
The removal of the old parsers is planned; for now they'll
remain, until a major parser becomes definitively outdated.
Secondary parsers will be removed as they'll become useless; by
default they will return an empty result (issuing a warning), but
they can be replaced with a call to the corresponding new parser,
if you want: all you need to do is to set to True the "fallBackToNew"
parameter.
E.g.:
from imdb import IMDb
ia = IMDb('http', oldParsers=True, fallBackToNew=True)
...
FORCING LXML OR BEAUTIFULSOUP
=============================
......
......@@ -63,7 +63,8 @@ you can access movie data through the e-mail interface, etc. etc.
---------------------------+-----------+------------------------------------
'sql' | 'db', | information are fetched through
| 'database'| a SQL database (every database
| | supported by SQLObject is available).
| | supported by SQLObject and SQLAlchemy
| | is available).
---------------------------+-----------+------------------------------------
'mobile' | | same as 'httpThin', but string
| | methods are used for parsing.
......
NOTE: the imdbpy2sql.py script, used to populate a database using
the data in the IMDb's plain text data files, is a critical piece
of IMDbPY: it's based on the SQLObject ORM to be database-independent
and contains a lot of tricks to be as fast as possible; however there
of IMDbPY: it's based on an ORM to be database-independent and
contains a lot of tricks to be as fast as possible; however there
are huge margins for improvements; if you want to help, please read the
TODO.txt file and subscribe the imdbpy-devel mailing list at:
http://imdbpy.sf.net/?page=help#ml
......@@ -23,6 +23,18 @@ This means that MySQL, PostgreSQL, SQLite, Firebird, MAX DB,
Sybase and MSSQL are supported and, as you read this text,
maybe other database backends were added.
Since release 3.8, SQLAlchemy (version 0.4 and 0.5) is also supported
(this adds at least DB2/Informix IDS to the list of supported databases).
REQUIREMENTS
============
You need one of SQLObject or SQLAlchemy (both can be installed
safely: by default IMDbPY first tries SQLObject; if not present
it falls back to SQLAlchemy).
[SQLObject]
You need the SQLObject package, at least version 0.8; even better
if you can download the latest SVN snapshot.
......@@ -30,12 +42,23 @@ SQLObject home page: http://sqlobject.org/
SVN command to download the latest development version:
svn co http://svn.colorstudy.com/SQLObject/trunk SQLObject
Obviously the SQLObject can access databases only through other
[SQLAlchemy]
Support for SQLAlchemy is still in beta (please report any bug!)
and a bit slower than SQLObject; anyway, you need version 0.4 or 0.5.
SQLAlchemy home page: http://www.sqlalchemy.org/
SVN command to download the latest development version:
svn checkout http://svn.sqlalchemy.org/sqlalchemy/trunk sqlalchemy
[OTHER REQUIRED MODULES]
Obviously SQLObject and SQLAlchemy can access databases only through other
specific modules/packages, that you need to have installed (e.g.:
python-mysqldb for MySQL, python-psycopg for PostgreSQL, python-sqlite
for SQLite and so on).
SQL DATABASE INSTALLATION
=========================
......@@ -77,7 +100,10 @@ Some examples:
sqlite:/C|/full/path/to/database
sqlite:/:memory:
For other information you can read the SQLObject documentation.
For other information you can read the SQLObject/SQLAlchemy documentation.
You can force the use of SQLObject or SQLAlchemy with the '-o' command
line option (i.e.: "-o sqlobject" or "-o sqlalchemy" or a list of comma
separated values to specify an order of preference).
TIMING
......@@ -90,6 +116,8 @@ complete on my test system (read below).
A lot of memory (RAM or swap space) is required, in the range of
at least 250/500 megabytes (plus more for the database server).
In the end, the database will require between 2.5GB and 5GB of disc space.
There should be no difference - at insert time - between SQLObject and
SQLAlchemy.
As said, the performances varies greatly using a database server or another:
MySQL, for instance, has an executemany() method of the cursor object
......@@ -222,6 +250,16 @@ Now you can use IMDbPY with the database:
and so on...
The 'sql' data access system takes an optional argument, named "useORM",
which can be set to a string or a list of values (the string can be
a comma-separated list of items, to denote an order of preference).
Valid values are "sqlobject" and "sqlalchemy".
The default is ('sqlobject', 'sqlalchemy').
E.g.:
i = IMDb('sql', uri='YOUR_URI_STRING', useORM='sqlalchemy,sqlobject')
i = IMDb('sql', uri='YOUR_URI_STRING', useORM=['sqlalchemy', 'sqlobject'])
i = IMDb('sql', uri='YOUR_URI_STRING', useORM='sqlalchemy')
ADVANCED FEATURES
=================
......
......@@ -72,12 +72,12 @@ mailing list: http://imdbpy.sf.net/?page=help#ml
If you plan to package IMDbPY for your distribution/operating system,
keep in mind that, while IMDbPY can work out-of-the-box, some external
package may be required for certain functionality:
- SQLObject: it's REQUIRED if you want to use the 'sql' data access
system.
- SQLObject or SQLAlchemy: one of these is REQUIRED if you want to use
the 'sql' data access system.
- python-lxml: the 'http' data access system will be much faster, if
it's installed.
it's installed.
Both should probably be "suggested" dependencies.
All of them should probably be "suggested" dependencies.
RECENT IMPORTANT CHANGES
......@@ -96,6 +96,9 @@ Since release 3.7, IMDbPY has moved its main parsers from a SAX-based
approach to a DOM/XPath-based one; see the README.newparsers file
for more information.
Since release 3.8, IMDbPY supports both SQLObject and SQLAlchemy; see
README.sqldb for more information.
FEATURES
========
......@@ -149,7 +152,7 @@ suitable for systems with limited bandwidth but normal CPU power.
======================================
* Returns every information available in the plain text data files.
* Every database supported by SQLObject is available.
* Every database supported by SQLObject and SQLAlchemy is available.
FEATURES OF THE MOBILE DATA ACCESS SYSTEM
......
......@@ -152,5 +152,6 @@ Things to do:
IMDb releases also "diff" files to keep the plain text files updated;
it would be wonderful to directly use these diff files to upgrade the
SQL database, but I think this is a nearly impossible task.
* replace SQLObject with SQLAlchemy?
* There are a lot of things to do to improve SQLAlchemy support (especially
in terms of performances); see FIXME/TODO/XXX notices in the code.
......@@ -30,20 +30,36 @@
[imdbpy]
# Default.
accessSystem = http
# Optional (options common to every data access system):
#adultSearch = on
#results = 20
# Optional (options common to http and mobile data access systems):
#proxy = http://localhost:8080/
#cookie_id = string_representing_the_cookie_id
#cookie_uu = string_representing_the_cookie_uu
# Parameters for the 'http' data access system.
# Use the old set of parsers (deprecated: they will go away).
#oldParsers = False
# If True, when an old parser is no more available, falls back to a call
# to a new parser (default, False: return an empty result).
#fallBackToNew = False
# (deprecated: they will go away).
# Parser to use; can be a single value or a list of values separated by
# a comma, to express order preference. Valid values: "lxml", "beautifulsoup"
# useModule = lxml,beautifulsoup
# Parameters for the 'mobile' data access system.
#accessSystem = mobile
# Parameters for the 'sql' data access system.
#accessSystem = sql
#uri = mysql://user:password@localhost/imdb
# ORM to use; can be a single value or a list of values separated by
# a comma, to express order preference. Valid values: "sqlobject", "sqlalchemy"
#useORM = sqlobject,sqlalchemy
# Parameters for the 'local' data access system.
#accessSystem = local
......
......@@ -338,7 +338,7 @@ class Movie(_Container):
s += u'Rating: %s' % rating
nr_votes = self.get('votes')
if nr_votes:
s += u'(%s votes)' % nr_votes
s += u' (%s votes)' % nr_votes
s += u'.\n'
plot = self.get('plot')
if plot:
......
......@@ -25,7 +25,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
__all__ = ['IMDb', 'IMDbError', 'Movie', 'Person', 'Character', 'Company',
'available_access_systems']
__version__ = VERSION = '3.7'
__version__ = VERSION = '3.8'
# Import compatibility module (importing it is enough).
import _compat
......
......@@ -26,6 +26,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import sys
import warnings
from urllib import FancyURLopener, quote_plus
from codecs import lookup
......@@ -43,14 +44,24 @@ import characterParser
import companyParser
class _FakeParser(object):
"""Fake parser to replace obsoleted old parsers."""
def __init__(self, *args, **kwds):
pass
def parse(self, *args, **kwds):
return {'data': {}}
class _ModuleProxy:
"""A proxy to instantiate and access parsers."""
def __init__(self, module, defaultKeys=None, oldParsers=False,
useModule=None):
useModule=None, fallBackToNew=False):
"""Initialize a proxy for the given module; defaultKeys, if set,
muste be a dictionary of values to set for instanced objects."""
self.oldParsers = oldParsers
self.useModule = useModule
self.fallBackToNew = fallBackToNew
if defaultKeys is None:
defaultKeys = {}
self._defaultKeys = defaultKeys
......@@ -66,7 +77,17 @@ class _ModuleProxy:
kwds = {}
if not self.oldParsers and self.useModule:
kwds = {'useModule': self.useModule}
obj = _entry[0][self.oldParsers](**kwds)
parserClass = _entry[0][self.oldParsers]
# Warns when the old parser was requested, but it's gone.
if self.oldParsers and parserClass is _entry[0][0]:
if not self.fallBackToNew:
warnings.warn('Old parser "%s" not available; these ' \
'data will not be available.' % name)
parserClass = _FakeParser
else:
warnings.warn('Old parser "%s" not available; falling ' \
'back to the new one.' % name)
obj = parserClass(**kwds)
attrsToSet = self._defaultKeys.copy()
attrsToSet.update(_entry[1] or {})
# Set attribute to the object.
......@@ -179,7 +200,6 @@ class IMDbURLopener(FancyURLopener):
if encode is None:
encode = 'latin_1'
# The detection of the encoding is error prone...
import warnings
warnings.warn('Unable to detect the encoding of the retrieved '
'page [%s]; falling back to default latin1.' % encode)
return unicode(content, encode, 'replace')
......@@ -211,8 +231,8 @@ class IMDbHTTPAccessSystem(IMDbBase):
accessSystem = 'http'
def __init__(self, isThin=0, adultSearch=1, proxy=-1, oldParsers=False,
useModule=None, cookie_id=-1, cookie_uu=None,
*arguments, **keywords):
fallBackToNew=False, useModule=None, cookie_id=-1,
cookie_uu=None, *arguments, **keywords):
"""Initialize the access system."""
IMDbBase.__init__(self, *arguments, **keywords)
self.urlOpener = IMDbURLopener()
......@@ -238,24 +258,35 @@ class IMDbHTTPAccessSystem(IMDbBase):
self.set_cookies(cookie_id, cookie_uu)
if proxy != -1:
self.set_proxy(proxy)
if useModule is not None:
if not isinstance(useModule, (list, tuple)) and ',' in useModule:
useModule = useModule.split(',')
_def = {'_modFunct': self._defModFunct, '_as': self.accessSystem}
# Proxy objects.
self.smProxy = _ModuleProxy(searchMovieParser, defaultKeys=_def,
oldParsers=oldParsers, useModule=useModule)
oldParsers=oldParsers, useModule=useModule,
fallBackToNew=fallBackToNew)
self.spProxy = _ModuleProxy(searchPersonParser, defaultKeys=_def,
oldParsers=oldParsers, useModule=useModule)
oldParsers=oldParsers, useModule=useModule,
fallBackToNew=fallBackToNew)
self.scProxy = _ModuleProxy(searchCharacterParser, defaultKeys=_def,
oldParsers=oldParsers, useModule=useModule)
oldParsers=oldParsers, useModule=useModule,
fallBackToNew=fallBackToNew)
self.scompProxy = _ModuleProxy(searchCompanyParser, defaultKeys=_def,
oldParsers=oldParsers, useModule=useModule)
oldParsers=oldParsers, useModule=useModule,
fallBackToNew=fallBackToNew)
self.mProxy = _ModuleProxy(movieParser, defaultKeys=_def,
oldParsers=oldParsers, useModule=useModule)
oldParsers=oldParsers, useModule=useModule,
fallBackToNew=fallBackToNew)
self.pProxy = _ModuleProxy(personParser, defaultKeys=_def,
oldParsers=oldParsers, useModule=useModule)
oldParsers=oldParsers, useModule=useModule,
fallBackToNew=fallBackToNew)
self.cProxy = _ModuleProxy(characterParser, defaultKeys=_def,
oldParsers=oldParsers, useModule=useModule)
oldParsers=oldParsers, useModule=useModule,
fallBackToNew=fallBackToNew)
self.compProxy = _ModuleProxy(companyParser, defaultKeys=_def,
oldParsers=oldParsers, useModule=useModule)
oldParsers=oldParsers, useModule=useModule,
fallBackToNew=fallBackToNew)
def _normalize_movieID(self, movieID):
"""Normalize the given movieID."""
......
......@@ -40,23 +40,29 @@ def tostring(element):
return str(element)
def fix_rowspans(html_string):
    """Repeat td elements according to their rowspan attributes in subsequent
    tr elements.
    """
    # Parse the page with the module-level BeautifulSoup helpers.
    dom = fromstring(html_string)
    # Every cell that explicitly declares a rowspan attribute.
    cols = dom.findAll('td', rowspan=True)
    for col in cols:
        span = int(col.get('rowspan'))
        # Column index of this cell inside its own row.
        position = len(col.findPreviousSiblings('td'))
        row = col.parent
        next = row
        for i in xrange(span-1):
            # Walk down to each following row covered by the rowspan.
            next = next.findNextSibling('tr')
            # if not cloned, child will be moved to new parent
            clone = fromstring(tostring(col)).td
            next.insert(position, clone)
    # NOTE(review): indentation reconstructed from context — the clone/insert
    # pair is assumed to run once per spanned row; confirm against upstream.
    return tostring(dom)
def getattribute(node, attrName):
    """Fetch attrName from node, defaulting to None when it is absent."""
    value = node.get(attrName)
    return value
def setattribute(node, attrName, attrValue):
    """Assign attrValue to attrName on node; a None value deletes the
    attribute instead."""
    if attrValue is not None:
        node[attrName] = attrValue
        return
    del node[attrName]
def getparent(node):
    """Return the parent of the given node."""
    return getattr(node, 'parent')
def clone(node):
    """Return a clone of the given node."""
    # XXX: test with deepcopy? Check if there are problems with
    # python 2.4 and previous.
    # Serialize and re-parse: re-parsing yields a brand new tree, so the
    # returned child is detached from the original document.
    serialized = tostring(node)
    return fromstring(serialized).findChild(True)
def apply_xpath(node, path):
......
......@@ -37,23 +37,29 @@ def tostring(element):
return html.tostring(element, encoding=unicode)
def fix_rowspans(html_string):
"""Repeat td elements according to their rowspan attributes in subsequent
tr elements.
"""
dom = fromstring(html_string)
cols = dom.xpath("//td[@rowspan]")
for col in cols:
span = int(col.get('rowspan'))
position = len(col.xpath("./preceding-sibling::td"))
row = col.getparent()
next = row
for i in xrange(span-1):