Upgrading to GitLab 11.10.0. Expect errors and see debian-infrastructure-announce@lists.debian.org for further information.

Commit 90a7ba5b authored by Ana Guerrero López's avatar Ana Guerrero López

Import Upstream version 3.0

parent efc87ed9
......@@ -5,7 +5,7 @@ imdbpy2sql.py script.
This script puts the data of the plain text data files into a
SQL database.
Copyright 2005-2006 Davide Alberani <da@erlug.linux.it>
Copyright 2005-2007 Davide Alberani <da@erlug.linux.it>
2006 Giuseppe "Cowo" Corbelli <cowo --> lugbs.linux.it>
This program is free software; you can redistribute it and/or modify
......@@ -38,6 +38,11 @@ from imdb.parser.local.movieParser import _bus, _ldk, _lit, _links_sect
from imdb.parser.local.personParser import _parseBiography
from imdb._exceptions import IMDbParserError
_articles = list(_articles)
for i, art in enumerate(_articles):
if not isinstance(art, unicode): continue
_articles[i] = art.encode('utf_8')
re_nameImdbIndex = re.compile(r'\(([IVXLCDM]+)\)')
HELP = """imdbpy2sql.py usage:
......@@ -200,9 +205,9 @@ def title_soundex(title):
article is pruned. It assumes to receive a title without year/imdbIndex
or kind indications, but just the title string, as the one in the
analyze_title(title)['title'] value."""
title = unicode(title, 'utf_8')
##if not isinstance(title, unicode): title = unicode(title, 'utf_8')
# Prune non-ascii chars from the string.
title = title.encode('ascii', 'replace')
##title = title.encode('ascii', 'replace')
if not title: return None
ts = title.split(', ')
# Strip the ending article, if any.
......@@ -219,9 +224,9 @@ def name_soundexes(name):
from the first one.
The third is the soundex of the surname, if different from the
other two values."""
name = unicode(name, 'utf_8')
##if not isinstance(name, unicode): name = unicode(name, 'utf_8')
# Prune non-ascii chars from the string.
name = name.encode('ascii', 'ignore')
##name = name.encode('ascii', 'ignore')
if not name: return (None, None, None)
s1 = soundex(name)
name_normal = normalizeName(name)
......@@ -432,9 +437,9 @@ class _BaseCache(dict):
self._tmpDict.clear()
except OperationalError, e:
# Dataset too large; split it in two and retry.
## new code!
## the same class instance (self) is used, instead of
## creating two separated objects.
# XXX: new code!
# the same class instance (self) is used, instead of
# creating two separated objects.
_recursionLevel += 1
self._flushing = 0
firstHalf = {}
......@@ -503,11 +508,11 @@ class MoviesCache(_BaseCache):
def __init__(self, *args, **kwds):
_BaseCache.__init__(self, *args, **kwds)
self.episodesYear = {}
self.movieYear = {}
self.sqlstr, self.converter = createSQLstr(Title, ('id', 'title',
'imdbIndex', 'kindID', 'productionYear',
'phoneticCode', 'episodeOfID',
'seasonNr', 'episodeNr'))
'seasonNr', 'episodeNr', 'seriesYears'))
def populate(self):
print ' * POPULATING %s...' % self.className
......@@ -520,9 +525,9 @@ class MoviesCache(_BaseCache):
episodeofidCol = colName(Title, 'episodeOfID')
seasonNrCol = colName(Title, 'seasonNr')
episodeNrCol = colName(Title, 'episodeNr')
sqlPop = 'SELECT %s, %s, %s, %s, %s, %s, %s, %s FROM %s;' % (movieidCol,
titleCol, kindidCol, yearCol, imdbindexCol,
episodeofidCol, seasonNrCol, episodeNrCol, titleTbl)
sqlPop = 'SELECT %s, %s, %s, %s, %s, %s, %s, %s FROM %s;' % \
(movieidCol, titleCol, kindidCol, yearCol, imdbindexCol,
episodeofidCol, seasonNrCol, episodeNrCol, titleTbl)
CURS.execute(sqlPop)
_oldcacheValues = Title.sqlmeta.cacheValues
Title.sqlmeta.cacheValues = False
......@@ -556,7 +561,7 @@ class MoviesCache(_BaseCache):
tmpDictiter = self._tmpDict.iteritems
for k, v in tmpDictiter():
try:
t = analyze_title(k)
t = analyze_title(k, _emptyString='')
except IMDbParserError:
if k and k.strip():
print 'WARNING %s._toDB() invalid title:' % self.className,
......@@ -570,15 +575,17 @@ class MoviesCache(_BaseCache):
stitle = build_title(tget('episode of'), canonical=1)
episodeOf = self.addUnique(stitle)
del t['episode of']
year = self.episodesYear.get(v)
year = self.movieYear.get(v)
if year is not None:
try: t['year'] = int(year)
except ValueError: pass
elif kind in ('tv series', 'tv mini series'):
t['series years'] = self.movieYear.get(v)
title = tget('title')
soundex = title_soundex(title)
lapp((v, title, tget('imdbIndex'), KIND_IDS[kind],
tget('year'), soundex, episodeOf,
tget('season'), tget('episode')))
tget('season'), tget('episode'), tget('series years')))
self._runCommand(l)
def _runCommand(self, dataList):
......@@ -777,16 +784,16 @@ def readMovieList():
line_d = unpack(line, ('title', 'year'))
title = line_d['title']
yearData = None
# Collect 'year' column for tv series' episodes.
if title[-1:] == '}':
yearData = [('episodesYear', line_d['year'])]
# Collect 'year' column for tv "series years" and episodes' year.
if title[0] == '"':
yearData = [('movieYear', line_d['year'])]
mid = CACHE_MID.addUnique(title, yearData)
if count % 10000 == 0:
print 'SCANNING movies:', _(title),
print '(movieID: %s)' % mid
count += 1
CACHE_MID.flush()
CACHE_MID.episodesYear.clear()
CACHE_MID.movieYear.clear()
mdbf.close()
......@@ -816,8 +823,17 @@ def doCast(fp, roleid, rolename):
if not item: continue
if item[0] == '[':
role = item[1:-1]
if role[-1:] == ')':
nidx = role.find('(')
if nidx != -1:
note = role[nidx:]
role = role[:nidx].rstrip()
if not role: role = None
elif item[0] == '(':
note = item
if note is None:
note = item
else:
note = '%s %s' % (note, item)
elif item[0] == '<':
textor = item[1:-1]
try:
......@@ -923,7 +939,7 @@ class AkasMoviesCache(MoviesCache):
# id of the referred title.
original_title_id = self.ids.get(the_id)
new_item = [the_id, original_title_id]
new_item += item[1:]
new_item += item[1:-1]
new_item.append(self.notes.get(the_id))
new_dataListapp(tuple(new_item))
new_dataList.reverse()
......@@ -1329,7 +1345,7 @@ def getRating():
def getTopBottomRating():
"""Movie's rating, scanning for top 250 and bottom 100."""
"""Movie's rating, scanning for top 250 and bottom 10."""
for what in ('top 250 rank', 'bottom 10 rank'):
if what == 'top 250 rank': st = RAT_TOP250_START
else: st = RAT_BOT10_START
......@@ -1449,43 +1465,60 @@ def run():
sys.stdout.flush()
dropTables()
print 'done!'
# Rebuild the database structure.
print 'CREATING new tables...',
sys.stdout.flush()
createTables()
# Read the constants.
readConstants()
print 'done!'
t('dropping and recreating the database')
# Populate the CACHE_MID instance.
readMovieList()
# Comment readMovieList() and uncomment the following two lines
# to keep the current info in the name and title tables.
##CACHE_MID.populate()
##CACHE_PID.populate()
t('readMovieList()')
# actors, actresses, directors, ....
# actors, actresses, producers, writers, cinematographers, composers,
# costume-designers, directors, editors, miscellaneous,
# production-designers.
castLists()
# Aka names and titles.
doAkaNames()
t('doAkaNames()')
doAkaTitles()
t('doAkaTitles()')
# alternate-versions, goofs, crazy-credits, quotes, soundtracks, trivia.
doMinusHashFiles()
t('doMinusHashFiles()')
# biographies, business, laserdisc, literature, mpaa-ratings-reasons, plot.
doNMMVFiles()
# certificates, color-info, countries, distributors, genres, keywords,
# language, locations, miscellaneous-companies, production-companies,
# running-times, sound-mix, special-effects-companies, technical,
# release-dates.
doMiscMovieInfo()
# movie-links.
doMovieLinks()
t('doMovieLinks()')
# ratings.
getRating()
t('getRating()')
# taglines.
getTaglines()
t('getTaglines()')
# ratings (top 250 and bottom 10 movies).
getTopBottomRating()
t('getTopBottomRating()')
# complete-cast, complete-crew.
completeCast()
t('completeCast()')
......@@ -1501,6 +1534,7 @@ def run():
print 'building database indexes (this may take a while)'
sys.stdout.flush()
# Build database indexes.
createIndexes()
t('createIndexes()')
......
......@@ -5,10 +5,17 @@ who share the copyright on some portions of the code.
I'd like to thank the following people for their help:
* Ana Guerrero, for maintaining the official debian package.
* Tero Saarni, for the series 60 GUI and a lot of testing and
debugging.
* Ana Guerrero, for the official debian package.
* H. Turgut Uyar for a number of bug reports and a lot of work on
the test-suite.
* Jesper Nøhr for a lot of testing, especially on the 'sql'.
* Ivan Kedrin for a bug report about the analyze_title function.
* Hadley Rich for reporting bugs and providing patches for troubles
parsing tv series' episodes and searching for tv series' titles.
......@@ -35,7 +42,7 @@ I'd like to thank the following people for their help:
* none none (dclist at gmail.com) for a useful hint and code to
retrieve a movie/person object, given an URL.
* Sebastian Pölsterl, for a bug report about the cover url for
* Sebastian Pölsterl, for a bug report about the cover url for
tv (mini) series, and another one about search_* methods.
* Martin Kirst for many hints and the work on the imdbpyweb program.
......
Changelog for IMDbPY
====================
* What's the new in release 3.0 "Spider-Man 3" (03 May 2007)
[global]
- IMDbPY now works with the new IMDb's site design; a new account is
used to access data; this affects a lot of code, especially in the
'http', 'httpThin' and 'mobile' data access systems.
- every returned string should now be unicode; dictionary keywords are
_not_ guaranteed to be unicode (but they are always 7bit strings).
- fixed a bug in the __contains__ method of the Movie class.
- fix in the analyze_title() function to handle malformed episode
numbers.
[http]
- introduced the _in_content instance variable for objects that are instances
of ParserBase, True when inside the <div id="tn15content"> tag.
When this pair of tags is opened and closed, two methods, named
_begin_content() and _end_content(), are called with no parameters (by
default, they do nothing).
- in the utils module there's the build_person function, useful to create
a Person instance from the typical formats found on IMDb's web site.
- an analogous build_movie function can be used to instantiate Movie objects.
- inverted the getRefs default - now if not otherwise set, it's False.
- added a parser for the "merchandising" ("for sale") page for persons.
- the 'rating' parser now collects also 'rating' and 'votes' data.
- the HTMLMovieParser class (for movies) was rewritten from zero.
- the HTMLMaindetailsParser class (for persons) was rewritten from zero.
- unified the "episode list" and "episodes cast" parsers.
- fixed a bug parsing locations, which resulted in missing information.
- locations_parser split from the "tech" parser.
- "connections" parser now handles the recently introduced notes.
[http parser conversion]
- these parsers worked out-of-the-box; airing, eprating, alternateversions,
dvd, goofs, keywords, movie_awards, movie_faqs, person_awards, rec,
releasedates, search_movie, search_person, soundclips, soundtrack, trivia,
videoclips.
- these parsers were fixed; amazonrev, connections, episodes, crazycredits,
externalrev, misclinks, newsgrouprev, news, officialsites, otherworks,
photosites, plot, quotes, ratings, sales, taglines, tech, business,
literature, publicity, trivia, videoclips, maindetails, movie.
[mobile]
- fixed to work with the new design.
- a lot of code is now shared amongst 'http' and 'mobile'.
[sql]
- fixes for other bugs related to unicode support.
- minor changes to slightly improve performance.
* What's the new in release 2.9 "Rodan! The Flying Monster" (21 Feb 2007)
[global]
- on 19 February IMDb has redesigned its site; this is the last
......
......@@ -3,7 +3,7 @@ IMDbPY
NOTE: see also the recommendations in the "DISCLAIMER.txt" file.
Copyright 2004-2006 Davide Alberani <da@erlug.linux.it>
Copyright 2004-2007 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......
......@@ -230,6 +230,9 @@ certificates; string list; ['UK:15', 'USA:R']
mpaa; string; the mpaa rating
episodes (series only); dictionary of dictionary; one key for every season,
one key for every episode in the season.
number of episodes (series only); int; total number of episodes.
number of seasons (series only); int; total number of seasons.
series years (series only); string; range of years when the series was produced.
episode of (episode only); Movie object; the parent series for an episode.
season (episode only); int; the season number.
episode (episode only); int; the number of the episode in the season.
......
......@@ -108,6 +108,25 @@ data set; e.g.:
i.update(m, 'full credits')
print len(m['cast']) # yup! More than 300 persons!
If you prefer, you can retrieve the complete cast of every episode,
keeping the lists separated for every episode; instead of retrieving
the list of episodes with:
i.update(m, 'episodes')
use instead:
i.update(m, 'episodes cast')
or the equivalent:
i.update(m, 'guests')
Now you end up having the same information as if you had updated
the 'episodes' info set, but every Movie object inside the dictionary
of dictionaries has the complete cast.
E.g.:
cast = m['episodes'][1][2]['cast'] # cast list for the second episode
# of the first season.
Beware that both 'episodes cast' and 'guests' will update the
keyword 'episodes' (and not 'episodes cast' or 'guests').
RATINGS
=======
......
......@@ -44,12 +44,12 @@ Andrei Rublyov (1969)
Passion de Jeanne d'Arc, La (1928)
Passion of Darkly Noon, The (1996)
Passion of Ayn Rand, The (1999)
Passion Béatrice, La (1987)
Passion Béatrice, La (1987)
Passion, En (1969)
Pride and the Passion, The (1957)
"Charles II: The Power & the Passion" (2003) (mini)
Pasión de María Elena, La (2003)
Pasión según Berenice, La (1976)
Pasión de María Elena, La (2003)
Pasión según Berenice, La (1976)
Passion of Rita Camilleri, The (1993)
Culture, Water, Money: The Passion of the Frontier (1998)
Sanguisuga conduce la danza, La (1975)
......
......@@ -3,6 +3,9 @@
Starting with release 2.4, IMDbPY internally manages (almost) every string
using unicode, with UTF-8 encoding.
Since release 3.0, every string containing some sort of information is
guaranteed to be unicode (notable exceptions are dictionary keywords and
movieID/personID, where they are stored as strings).
The good: we can correctly manage "foreign" names, titles and other
information.
......@@ -10,8 +13,8 @@ The good: we can correctly manage "foreign" names, titles and other
about the original charset.
Without knowing the charset, how can you know that the bytecode
string 'Lina Wertm\xfcller' is west-European iso-8859-1 (and so
it's "Lina Wertmüller") and not Cyrillic KOI-8-R (resulting
in "Lina WertmЭller")?
it's "Lina Wertmüller" - if you're reading this file as UTF-8)
and not Cyrillic KOI-8-R (resulting in "Lina WertmЭller")?
Using unicode, you can store every human language, and show/print
every char correctly, provided that your local charset (and font)
is right.
......@@ -26,8 +29,8 @@ The bad: in primis, performances will suffer: IMDbPY does _a lot_ (and
before they can be printed on screen or on files.
The ugly: converting to unicode a program so huge, born without unicode
support from start, is prone to errors, bugs, self-combustion
and eternal damnation!
support from start, is prone to errors, bugs, spontaneous
combustion and eternal damnation!
You can't mix bytecode strings (with unknown charset) and unicode
with impunity: an exception will be raised because python
doesn't know the encoding of the bytecode string, that must be
......@@ -120,9 +123,10 @@ Whenever you're outputting information about movies or persons,
convert these unicode string to bytecode strings using the encoding
of your output channel (terminal, net, web pages, ...)
Remember: "u = unicode(inputEncoding)" to convert your input to unicode,
Remember: "u = unicode(string, inputEncoding)" converts your input
string to unicode,
"s = u.encode(outputEncoding, manageErrors)" to convert unicode
"s = u.encode(outputEncoding, manageErrors)" converts unicode
strings to your local environment.
......
......@@ -85,6 +85,9 @@ NOTE: it's always time to clean the code! <g>
[local data access system]
* There's a bug with release dates: it seems that for some (many?)
movies, the wrong index to release-dates.data is returned; I'm
wondering if this is a bug of IMDbPY or a bug in mkdb.
* There's probably a bug converting the rating to a float;
see: ER (1994) (TV), but I suspect this to be a mkdb bug.
* The 'votes' key is not correctly stored for very high values;
......
......@@ -4,7 +4,7 @@ Movie module (imdb package).
This module provides the Movie class, used to store information about
a given movie.
Copyright 2004-2006 Davide Alberani <da@erlug.linux.it>
Copyright 2004-2007 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......@@ -21,7 +21,7 @@ along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
from types import UnicodeType, ListType, TupleType
from types import UnicodeType, ListType, TupleType, DictType
from copy import deepcopy
from imdb.utils import analyze_title, build_title, normalizeTitle, \
......@@ -47,6 +47,7 @@ class Movie(_Container):
'plot summary': 'plot',
'plot summaries': 'plot',
'directed by': 'director',
'created by': 'creator',
'writing credits': 'writer',
'produced by': 'producer',
'original music by': 'composer',
......@@ -77,6 +78,7 @@ class Movie(_Container):
'assistant director',
'second unit director': 'assistant director',
'sound department': 'sound crew',
'costume and wardrobe department': 'costume department',
'special effects by': 'special effects',
'visual effects by': 'visual effects',
'stunts': 'stunt performer',
......@@ -90,9 +92,12 @@ class Movie(_Container):
'aka': 'akas',
'also known as': 'akas',
'country': 'countries',
'genre': 'genres',
'runtime': 'runtimes',
'lang': 'languages',
'color': 'color info',
'cover': 'cover url',
'seasons': 'number of seasons',
'language': 'languages',
'certificate': 'certificates',
'certifications': 'certificates',
......@@ -146,12 +151,12 @@ class Movie(_Container):
if title and not self.data.has_key('title'):
self.set_title(title)
self.movieID = kwds.get('movieID', None)
self.myTitle = kwds.get('myTitle', '')
self.myTitle = kwds.get('myTitle', u'')
def _reset(self):
"""Reset the Movie object."""
self.movieID = None
self.myTitle = ''
self.myTitle = u''
def set_title(self, title):
"""Set the title of the movie."""
......@@ -224,13 +229,15 @@ class Movie(_Container):
self.movieID is not None and self.movieID == other.movieID:
return 1
return 0
isSameMovie = isSameTitle # XXX: just for backward compatibility.
def __contains__(self, item):
"""Return true if the given Person object is listed in this Movie."""
from Person import Person
if not isinstance(item, Person):
return 0
for p in flatten(self.data, yieldDictKeys=1, scalar=Person):
for p in flatten(self.data, yieldDictKeys=1, scalar=Person,
toDescend=(ListType, DictType, TupleType, Movie)):
if item.isSame(p):
return 1
return 0
......
......@@ -4,7 +4,7 @@ Person module (imdb package).
This module provides the Person class, used to store information about
a given person.
Copyright 2004-2006 Davide Alberani <da@erlug.linux.it>
Copyright 2004-2007 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......@@ -68,7 +68,10 @@ class Person(_Container):
'where are they now': 'where now',
'personal quotes': 'quotes',
'mini-biography author': 'imdb mini-biography by',
'biography author': 'imdb mini-biography by'}
'biography author': 'imdb mini-biography by',
'genre': 'genres',
'keys': 'keywords',
'keyword': 'keywords'}
# 'nick names'???
keys_tomodify_list = ('mini biography', 'spouse', 'quotes', 'other works',
......@@ -101,13 +104,13 @@ class Person(_Container):
if name and not self.data.has_key('name'):
self.set_name(name)
self.personID = kwds.get('personID', None)
self.myName = kwds.get('myName', '')
self.myName = kwds.get('myName', u'')
self.billingPos = kwds.get('billingPos', None)
def _reset(self):
"""Reset the Person object."""
self.personID = None
self.myName = ''
self.myName = u''
self.billingPos = None
def _clear(self):
......@@ -175,7 +178,7 @@ class Person(_Container):
self.personID and self.personID == other.personID:
return 1
return 0
isSamePerson = isSameName # XXX: just for compatibility.
isSamePerson = isSameName # XXX: just for backward compatibility.
def __deepcopy__(self, memo):
"""Return a deep copy of a Person instance."""
......
......@@ -6,7 +6,7 @@ a person from the IMDb database.
It can fetch data through different media (e.g.: the IMDb web pages,
a local installation, a SQL database, etc.)
Copyright 2004-2006 Davide Alberani <da@erlug.linux.it>
Copyright 2004-2007 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......@@ -274,9 +274,9 @@ class IMDbBase:
raise IMDbDataAccessError, \
'the supplied object has null movieID or personID'
if mop.accessSystem == self.accessSystem:
as = self
aSystem = self
else:
as = IMDb(mop.accessSystem)
aSystem = IMDb(mop.accessSystem)
if info is None:
info = mop.default_info
elif info == 'all':
......@@ -290,7 +290,7 @@ class IMDbBase:
for i in info:
if i in mop.current_info and not override: continue
try:
method = getattr(as, 'get_%s_%s' %
method = getattr(aSystem, 'get_%s_%s' %
(prefix, i.replace(' ', '_')))
except AttributeError:
raise IMDbDataAccessError, 'unknown information set "%s"' % i
......@@ -310,14 +310,14 @@ class IMDbBase:
def get_imdbMovieID(self, movieID):
"""Translate a movieID in an imdbID (the ID used by the IMDb
web server; must be overridden by the subclass."""
web server); must be overridden by the subclass."""
# XXX: for the real implementation, see the method of the
# subclass, somewhere under the imdb.parser package.
raise NotImplementedError, 'override this method'
def get_imdbPersonID(self, personID):
"""Translate a personID in a imdbID (the ID used by the IMDb
web server; must be overridden by the subclass."""
web server); must be overridden by the subclass."""
# XXX: for the real implementation, see the method of the
# subclass, somewhere under the imdb.parser package.
raise NotImplementedError, 'override this method'
......@@ -373,19 +373,20 @@ class IMDbBase:
"""Return the imdbID for the given Movie or Person object."""
imdbID = None
if mop.accessSystem == self.accessSystem:
as = self
aSystem = self
else:
as = IMDb(mop.accessSystem)
aSystem = IMDb(mop.accessSystem)
if isinstance(mop, Movie.Movie):
if mop.movieID is not None:
imdbID = as.get_imdbMovieID(mop.movieID)
imdbID = aSystem.get_imdbMovieID(mop.movieID)
else:
imdbID = as.title2imdbID(build_title(mop, canonical=1, ptdf=1))
imdbID = aSystem.title2imdbID(build_title(mop, canonical=1,
ptdf=1))
elif isinstance(mop, Person.Person):
if mop.personID is not None:
imdbID = as.get_imdbPersonID(mop.personID)
imdbID = aSystem.get_imdbPersonID(mop.personID)
else:
imdbID = as.name2imdbID(build_name(mop, canonical=1))
imdbID = aSystem.name2imdbID(build_name(mop, canonical=1))
else:
raise IMDbError, 'object ' + repr(mop) + \
' is not a Movie or Person instance'
......
......@@ -4,7 +4,7 @@ helpers module (imdb package).
This module provides functions not used directly by the imdb package,
but useful for IMDbPY-based programs.
Copyright 2006 Davide Alberani <da@erlug.linux.it>
Copyright 2006-2007 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......
......@@ -214,7 +214,7 @@ def scan_names(name_list, name1, name2, name3, results=0, ro_thresold=None):
ratios = [ratcliff(name1, nil, sm1) + 0.05]
nils = nil.split(', ', 1)
surname = nils[0]
namesurname = ''
namesurname = u''
if len(nils) == 2: namesurname = '%s %s' % (nils[1], surname)
if surname != nil:
# Distance with the "Surname" in the database.
......
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
......@@ -8,7 +8,7 @@ E.g., for when searching for the title "the passion", the parsed
page would be:
http://akas.imdb.com/find?q=the+passion&tt=on&mx=20
Copyright 2004-2006 Davide Alberani <da@erlug.linux.it>
Copyright 2004-2007 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......@@ -36,15 +36,12 @@ class BasicMovieParser(ParserBase):
It's used by the HTMLSearchMovieParser class to return a result
for a direct match (when a search on IMDb results in a single
movie, the web server sends directly the movie page."""
# Do not gather names and titles references.
getRefs = 0
def _reset(self):
"""Reset the parser."""
self._result = {}
self._movieID = None
self._reading_page_title = 0
self._page_title = ''
self._page_title = u''
self._inbch = 0
self._in_series_title = 0
self._in_series_info = 0
......@@ -76,7 +73,7 @@ class BasicMovieParser(ParserBase):
n = self.get_attr_value(attrs, 'name')
if n: n = n.strip().lower()
if n in ('arg', 'auto'):
val = self.get_attr_value(attrs, 'value') or ''
val = self.get_attr_value(attrs, 'value') or u''
# XXX: use re_imdbIDonly because in the input field
# the movieID is not preceded by 'tt'.
if n == 'arg': nr = self.re_imdbIDonly.findall(val)
......@@ -113,7 +110,7 @@ class BasicMovieParser(ParserBase):
d_title = analyze_title(st, canonical=1)
m = Movie(movieID=str(self.__seriesID), data=d_title,
accessSystem='http')
self._result['kind'] = 'episode'
self._result['kind'] = u'episode'
self._result['episode of'] = m
self._series_title = u''
elif self._in_series_info:
......@@ -135,7 +132,7 @@ class BasicMovieParser(ParserBase):
self._result['season'] = season
if episode or type(season) is type(0):
self._result['episode'] = episode
self._series_info = ''
self._series_info = u''
def _handle_data(self, data):
if self._reading_page_title:
......@@ -155,21 +152,18 @@ class BasicMovieParser(ParserBase):
class HTMLSearchMovieParser(ParserBase):
"""Parse the html page that the IMDb web server shows when the
"new search system" is used."""
# Do not gather names and titles references.
getRefs = 0
def _reset(self):
"""Reset the parser."""
self._results = []
self._begin_list = 0
self._is_title = 0
self._reading_page_title = 0
self._current_imdbID = ''
self._current_title = ''
self._current_imdbID = u''
self._current_title = u''
self._no_more = 0
self._stop = 0
def parse(self, cont, results=None):
def parse(self, cont, results=None, **kwds):
self.maxres = results
return ParserBase.parse(self, cont)
......@@ -189,8 +183,8 @@ class HTMLSearchMovieParser(ParserBase):
def end_ol(self):
self._begin_list = 0
self._is_title = 0
self._current_title = ''
self._current_imdbID = ''
self._current_title = u''
self._current_imdbID = u''
def start_a(self, attrs):