Commit 613203f0 authored by Ana Guerrero López's avatar Ana Guerrero López

Import Upstream version 2.7

parent 06871e6e
......@@ -56,12 +56,15 @@ except imdb.IMDbError, e:
sys.exit(3)
# Print the results.
print ' %s results for "%s":' % (len(results),
print ' %s result%s for "%s":' % (len(results),
('', 's')[len(results) != 1],
title.encode(out_encoding, 'replace'))
print 'movieID\t: imdbID : title'
# Print the long imdb title for every movie.
for movie in results:
print '%s: %s' % (i.get_imdbMovieID(movie.movieID),
print '%s\t: %s : %s' % (movie.movieID,
i.get_imdbID(movie),
movie['long imdb title'].encode(out_encoding, 'replace'))
......@@ -56,12 +56,15 @@ except imdb.IMDbError, e:
sys.exit(3)
# Print the results.
print ' %s results for "%s":' % (len(results),
print ' %s result%s for "%s":' % (len(results),
('', 's')[len(results) != 1],
name.encode(out_encoding, 'replace'))
print 'personID\t: imdbID : name'
# Print the long imdb name for every person.
for person in results:
print '%s: %s' % (i.get_imdbPersonID(person.personID),
print '%s\t: %s : %s' % (person.personID,
i.get_imdbID(person),
person['long imdb name'].encode(out_encoding, 'replace'))
......@@ -9,6 +9,11 @@ I'd like to thank the following people for their help:
* Ana Guerrero, for the official debian package.
* Michael G. Noll for a bug report and a patch to fix a bug
retrieving 'plot keywords'.
* Alain Michel, for a bug report about search_*.py and get_*.py scripts.
* Martin Arpon and Andreas Schoenle for bug reports (and patches)
about "runtime", "aka titles" and "production notes" information
not being parsed.
......@@ -19,6 +24,8 @@ I'd like to thank the following people for their help:
* Sebastian Pölsterl, for a bug report about the cover url for
tv (mini) series.
* Martin Kirst for many hints and the work on the imdbpyweb program.
* Julian Mayer, for a bug report and a patch about non-ascii chars.
* Wim Schut and "eccentric", for bug reports and patches about
......
Changelog for IMDbPY
====================
* What's new in release 2.7 "Pitch Black" (26 Sep 2006)
[general]
- fixed search_movie.py and search_person.py scripts; now they return
both the movieID/personID and the imdbID.
- the IMDbPY account was configured to hide the mini-headshots.
- http and mobile data access systems now try to handle queries
with too many results.
[http data access system]
- fixed a minor bug retrieving information about persons, with movies
in production.
- fixed support for cast list of tv series.
- fixed a bug retrieving 'plot keywords'.
- some left out company credits are now properly handled.
[mobile data access system]
- fixed a major bug with the cast list, after the changes to the
IMDb web site.
- fixed support for cast list of tv series.
- fixed a minor bug retrieving information about persons, with movies
in production.
- now every AKA title is correctly parsed.
[sql data access system]
- fixed a(nother) bug updating imdbID for movies and persons.
- fixed a bug retrieving personID, while handling names references.
[local data access system]
- "where now" information now correctly handles multiple lines (also
affecting the imdbpy2sql.py script).
* What's new in release 2.6 "They Live" (04 Jul 2006)
[general]
- renamed sortMovies to cmpMovies and sortPeople to cmpPeople; these
......
......@@ -13,7 +13,11 @@ NOTE: it's always time to clean the code! <g>
* Compatibility with Python 2.2 and previous versions is no longer assured
for every data access system (the imdbpy2sql.py script for sure
requires at least Python 2.3).
* The analyze_title/build_title functions are grown too complex.
* The analyze_title/build_title functions have grown too complex and
beyond their initial goals.
* the 'year' keyword can probably be an int, instead of a string;
the '????' case can be handled directly by the analyze_title/build_title
functions.
* for local and sql data access systems: some episode titles are
marked as {{SUSPENDED}}; they should probably be ignored.
......@@ -37,6 +41,8 @@ NOTE: it's always time to clean the code! <g>
notes ("written by", "as Aka Name", ...)
* The 'laserdisc' information for 'local' and 'sql' is probably
wrong: I think they merge data from different laserdisc titles.
* there are links to hollywoodreporter.com that are not gathered in
the "external reviews" page.
[Person objects]
......
......@@ -85,8 +85,8 @@ class Movie(_Container):
'miscellaneouscrew': 'miscellaneous crew',
'crewmembers': 'miscellaneous crew',
'crew members': 'miscellaneous crew',
'other companies': 'miscellaneous companies',
'misc companies': 'miscellaneous companies',
'misc companies': 'other companies',
'miscellaneous companies': 'other companies',
'aka': 'akas',
'also known as': 'akas',
'country': 'countries',
......@@ -102,6 +102,9 @@ class Movie(_Container):
'soundclips': 'sound clips',
'videoclips': 'video clips',
'photographs': 'photo sites',
'distributor': 'distributors',
'distribution': 'distributors',
'distribution companies': 'distributors',
'guest': 'guests',
'guest appearances': 'guests',
'tv guests': 'guests',
......
......@@ -77,14 +77,16 @@ class IMDbURLopener(FancyURLopener):
c_header = 'id=%s; uu=%s' % (_cookie_id, _cookie_uu)
self.addheaders.append(('Cookie', c_header))
def retrieve_unicode(self, url):
def retrieve_unicode(self, url, size=-1):
"""Retrieves the given URL, and returns a unicode string,
trying to guess the encoding of the data (assuming latin_1
by default)"""
encode = None
try:
if size != -1:
self.addheader('Range', 'bytes=0-%d' % size)
uopener = self.open(url)
content = uopener.read()
content = uopener.read(size=size)
server_encode = uopener.info().getparam('charset')
# look at the content-type HTML meta tag.
if server_encode is None and content:
......@@ -101,8 +103,16 @@ class IMDbURLopener(FancyURLopener):
except (LookupError, ValueError, TypeError):
pass
uopener.close()
if size != -1:
for index in xrange(len(self.addheaders)):
if self.addheaders[index][0] == 'Range':
del self.addheaders[index]
self.close()
except IOError, e:
if size != -1:
for index in xrange(len(self.addheaders)):
if self.addheaders[index][0] == 'Range':
del self.addheaders[index]
raise IMDbDataAccessError, {'errcode': e.errno,
'errmsg': str(e.strerror),
'url': url,
......@@ -219,17 +229,36 @@ class IMDbHTTPAccessSystem(IMDbBase):
c_header = 'id=%s; uu=%s' % (cookie_id, cookie_uu)
self.urlOpener.addheaders += [('Cookie', c_header)]
def _retrieve(self, url):
def _retrieve(self, url, size=-1):
"""Retrieve the given URL."""
return self.urlOpener.retrieve_unicode(url)
return self.urlOpener.retrieve_unicode(url, size=size)
def _get_search_content(self, kind, ton, results):
"""Retrieve the web page for a given search.
kind can be tt (for titles) or nm (for names)
ton is the title or the name to search.
results is the maximum number of results to be retrieved."""
params = 'q=%s&%s=on&mx=%s' % (quote_plus(ton), kind, str(results))
cont = self._retrieve(imdbURL_search % params)
if cont.find('more than 500 partial matches') == -1:
return cont
# The retrieved page contains no results, because too many
# titles or names contain the string we're looking for.
if kind == 'nm':
params = 'q=%s;more=nm' % quote_plus(ton)
else:
params = 'q=%s;more=tt' % quote_plus(ton)
size = 22528 + results * 512
return self._retrieve(imdbURL_search % params, size=size)
def _search_movie(self, title, results):
# The URL of the query.
# XXX: To retrieve the complete results list:
# params = urllib.urlencode({'more': 'tt', 'q': title})
##params = urllib.urlencode({'tt': 'on','mx': str(results),'q': title})
params = 'q=%s&tt=on&mx=%s' % (quote_plus(title), str(results))
cont = self._retrieve(imdbURL_search % params)
#params = 'q=%s&tt=on&mx=%s' % (quote_plus(title), str(results))
#cont = self._retrieve(imdbURL_search % params)
cont = self._get_search_content('tt', title, results)
return search_movie_parser.parse(cont, results=results)['data']
def get_movie_main(self, movieID):
......@@ -253,7 +282,7 @@ class IMDbHTTPAccessSystem(IMDbBase):
def get_movie_keywords(self, movieID):
cont = self._retrieve(imdbURL_movie % movieID + 'keywords')
return taglines_parser.parse(cont)
return keywords_parser.parse(cont)
def get_movie_alternate_versions(self, movieID):
cont = self._retrieve(imdbURL_movie % movieID + 'alternateversions')
......@@ -389,8 +418,9 @@ class IMDbHTTPAccessSystem(IMDbBase):
# XXX: To retrieve the complete results list:
# params = urllib.urlencode({'more': 'nm', 'q': name})
##params = urllib.urlencode({'nm': 'on', 'mx': str(results), 'q': name})
params = 'q=%s&nm=on&mx=%s' % (quote_plus(name), str(results))
cont = self._retrieve(imdbURL_search % params)
#params = 'q=%s&nm=on&mx=%s' % (quote_plus(name), str(results))
#cont = self._retrieve(imdbURL_search % params)
cont = self._get_search_content('nm', name, results)
return search_person_parser.parse(cont, results=results)['data']
def get_person_main(self, personID):
......
......@@ -116,6 +116,8 @@ class HTMLMovieParser(ParserBase):
self._is_mpaa = 0
self._mpaa = ''
self._inbch = 0
self._in_blackcatheader = 0
self._cur_blackcatheader = ''
self._isplotoutline = 0
self._plotoutline = u''
# If true, the next data should be merged with the previous one,
......@@ -222,6 +224,12 @@ class HTMLMovieParser(ParserBase):
if ind != -1:
sect = sect[:ind]
self._current_section = sect
elif link.startswith('/company') and self._cur_blackcatheader:
self._is_company_cred = 1
self._current_section = self._cur_blackcatheader.lower()
# To not override the other section with the same name.
if self._current_section == 'special effects':
self._current_section = 'special effects companies'
# Sections like 'cast', 'director', 'writer', etc. all
# begin with a link to a "/Glossary" page.
elif link.startswith('/glossary'):
......@@ -433,12 +441,13 @@ class HTMLMovieParser(ParserBase):
self._company_data = strip_amps(self._company_data)
self.append_item(self._current_section, self._company_data)
self._company_data = ''
self._is_company_cred = 0
#self._is_company_cred = 0
def start_ul(self, attrs): pass
def end_ul(self):
self._is_company_cred = 0
self._cur_blackcatheader = ''
def start_b(self, attrs):
self._is_akas = 0
......@@ -447,11 +456,15 @@ class HTMLMovieParser(ParserBase):
cls = cls.lower()
if cls == 'ch':
self._inbch = 1
elif self.mdparse and cls == 'blackcatheader':
self.end_table()
elif cls == 'blackcatheader':
self._in_blackcatheader = 1
self._cur_blackcatheader = ''
if self.mdparse:
self.end_table()
def end_b(self):
if self._inbch: self._inbch = 0
if self._in_blackcatheader: self._in_blackcatheader = 0
def do_img(self, attrs):
alttex = self.get_attr_value(attrs, 'alt')
......@@ -571,6 +584,13 @@ class HTMLMovieParser(ParserBase):
self._in_series_title = 1
elif sldata.startswith('original air date'):
self._in_series_info = 1
elif self._in_blackcatheader:
# An hack to support also the tv series' pages.
if sldata == 'cast':
self._is_cast_crew = 1
self._current_section = 'cast'
else:
self._cur_blackcatheader += data
if self.mdparse:
if sldata.startswith('cast overview, first billed only'):
self._is_cast_crew = 1
......
......@@ -67,11 +67,13 @@ class HTMLMaindetailsParser(ParserBase):
self._get_imdbID = 0
self._in_sect = 0
self._in_b = 0
self._in_i = 0
self._sect_name = ''
self._in_emailfriend = 0
self._in_akas = 0
self._aka = ''
self._aka = u''
self._akas = []
self._cur_status = u''
def get_data(self):
"""Return the dictionary."""
......@@ -171,6 +173,12 @@ class HTMLMaindetailsParser(ParserBase):
self._in_sect = 0
self._sect_name = ''
def start_i(self, attrs):
self._in_i = 1
def end_i(self):
self._in_i = 0
def start_li(self, attrs):
self._in_list = 1
self._seen_br = 0
......@@ -203,18 +211,37 @@ class HTMLMaindetailsParser(ParserBase):
elif self._roles.startswith('(VG)'):
tit += ' (VG)'
self._roles = self._roles[4:].strip()
sp = self._roles.find('(')
if sp != -1:
ep = self._roles.rfind(')')
if ep != -1:
notes = self._roles[sp:ep+1]
self._roles = self._roles[:sp-1].strip()
if self._roles.startswith('.... '):
self._roles = self._roles[5:]
has_note = 0
if self._roles.find('(') != -1: has_note = 1
sp = self._roles.split('....')
if len(sp) == 2:
notes = sp[0].strip().replace(' ', ' ')
self._roles = sp[1].strip()
fn = self._roles.find('(')
if fn != -1:
en = self._roles.rfind(')')
pnote = self._roles[fn:en+1].strip()
self._roles = '%s %s' % (self._roles[:fn].strip(),
self._roles[en+1:].strip())
self._roles = self._roles.strip()
if pnote:
if notes: notes += ' '
notes += pnote
else:
notes = ''.join(sp).strip().replace(' ', ' ')
self._roles = ''
movie = Movie(movieID=str(self._last_imdbID), title=tit,
accessSystem='http')
if notes: movie.notes = notes
movie.currentRole = self._roles
self._cur_status = self._cur_status.strip()
if self._cur_status:
if self._cur_status[0] == '(':
self._cur_status = self._cur_status[1:]
if self._cur_status[-1] == ')':
self._cur_status = self._cur_status[:-1]
movie['status'] = '%s' % self._cur_status
self._cur_status = u''
sect = self._sect_name.strip().lower()
self._person_data.setdefault(sect, []).append(movie)
self._title = ''
......@@ -277,7 +304,10 @@ class HTMLMaindetailsParser(ParserBase):
elif self._in_title and not self._seen_br:
self._title += data
elif self._in_list:
self._roles += data
if self._in_i:
self._cur_status += data
else:
self._roles += data
class HTMLBioParser(ParserBase):
......
......@@ -154,8 +154,6 @@ def _parseBiography(biol):
res.setdefault('pictorials', []).append(x[6:].strip())
elif x6 == 'CV: * ':
res.setdefault('magazine covers', []).append(x[6:].strip())
elif x6 == 'WN: * ':
res.setdefault('where now', []).append(x[6:].strip())
elif x4 == 'NK: ':
res.setdefault('nick names', []).append(normalizeName(x[4:]))
elif x6 == 'PI: * ':
......@@ -174,6 +172,8 @@ def _parseBiography(biol):
if books: res['books'] = books
agent = _parseList(biol, 'AG')
if agent: res['agent address'] = agent
wherenow = _parseList(biol, 'WN')
if wherenow: res['where now'] = wherenow[0]
biomovies = _parseList(biol, 'BT')
if biomovies: res['biographical movies'] = biomovies
guestapp = _buildGuests([x[6:].strip() for x in biol if x[:6] == 'GA: * '])
......
......@@ -106,12 +106,15 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
IMDbHTTPAccessSystem.__init__(self, isThin, *arguments, **keywords)
self.accessSystem = 'mobile'
def _mretrieve(self, url):
def _clean_html(self, html):
"""Normalize the retrieve html."""
html = re_spaces.sub(' ', html)
return subXMLRefs(html)
def _mretrieve(self, url, size=-1):
"""Retrieve an html page and normalize it."""
cont = IMDbHTTPAccessSystem._retrieve(self, url)
cont = re_spaces.sub(' ', cont)
cont = subXMLRefs(cont)
return cont
cont = self._retrieve(url, size=size)
return self._clean_html(cont)
def _getPersons(self, s, sep='<br>', hasCr=0, aonly=0):
"""Return a list of Person objects, from the string s; items
......@@ -143,6 +146,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
if aonly:
stripped = _findBetween(name, '>', '</a>')
if len(stripped) == 1: name = stripped[0]
if name[0:1] == '>': name = name[1:]
name = _unHtml(name)
if not (pid and name): continue
plappend(Person(personID=str(pid[0]), name=name,
......@@ -154,8 +158,9 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
def _search_movie(self, title, results):
##params = urllib.urlencode({'tt': 'on','mx': str(results),'q': title})
params = 'q=%s&tt=on&mx=%s' % (urllib.quote_plus(title), str(results))
cont = self._mretrieve(imdbURL_search % params)
#params = 'q=%s&tt=on&mx=%s' % (urllib.quote_plus(title), str(results))
#cont = self._mretrieve(imdbURL_search % params)
cont = self._get_search_content('tt', title, results)
title = _findBetween(cont, '<title>', '</title>')
res = []
if not title: return res
......@@ -287,23 +292,30 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
castdata = _findBetween(cont, 'Credited cast', '</table>')
if not castdata:
castdata = _findBetween(cont, 'Complete credited cast', '</table>')
if not castdata:
castdata = _findBetween(cont, 'Series Cast Summary', '</table>')
if castdata:
castdata = castdata[0]
fl = castdata.find('href=')
if fl != -1: castdata = '< a' + castdata[fl:]
if fl != -1: castdata = '<a ' + castdata[fl:]
smib = castdata.find('<tr><td align="center" colspan="3"><small>')
if smib != -1:
smie = castdata.rfind('</small></td></tr>')
if smie != -1:
castdata = castdata[:smib].strip() + \
castdata[smie+18:].strip()
cast = self._getPersons(castdata, sep='</tr><tr>', hasCr=1)
castdata = castdata.replace(' bgcolor="#F0F0F0"', '')
castdata = castdata.replace(' bgcolor="#FFFFFF"', '')
castdata = castdata.replace('/tr> <tr', '/tr><tr')
cast = self._getPersons(castdata, sep='</tr><tr', hasCr=1)
if cast: d['cast'] = cast
# FIXME: doesn't catch "complete title", which is not
# included in <i> tags.
# See "Gehr Nany Fgbevrf 11", movieID: 0282910
akas = _findBetween(cont, '<i class="transl">', '<br')
akas = _findBetween(cont, '<b class="ch">Also Known As:</b>',
'<b class="ch">')
if akas:
akas[:] = [x for x in akas[0].split('<br>') if x.strip()]
akas = [_unHtml(x).replace(' (','::(', 1).replace(' [','::[')
for x in akas]
d['akas'] = akas
......@@ -359,8 +371,9 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
def _search_person(self, name, results):
##params = urllib.urlencode({'nm': 'on', 'mx': str(results), 'q': name})
params = 'q=%s&nm=on&mx=%s' % (urllib.quote_plus(name), str(results))
cont = self._mretrieve(imdbURL_search % params)
#params = 'q=%s&nm=on&mx=%s' % (urllib.quote_plus(name), str(results))
#cont = self._mretrieve(imdbURL_search % params)
cont = self._get_search_content('nm', name, results)
name = _findBetween(cont, '<title>', '</title>')
res = []
if not name: return res
......@@ -491,12 +504,28 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
role = u''
if not istvguest:
ms = m.split('....')
if len(ms) >= 1:
first = ms[0]
if first and first[0] == '(':
notes = first.strip()
ms = ms[1:]
if ms: role = ' '.join(ms).strip()
if len(ms) == 2:
notes = ms[0].strip().replace(' ', ' ')
role = ms[1].strip()
fn = role.find('(')
if fn != -1:
en = role.rfind(')')
pnote = role[fn:en+1].strip()
role = '%s %s' % (role[:fn].strip(),
role[en+1:].strip())
role = role.strip()
if pnote:
if notes: notes += ' '
notes += pnote
else:
notes = ''.join(ms).strip().replace(' ', ' ')
role = u''
#if len(ms) >= 1:
# first = ms[0]
# if first and first[0] == '(':
# notes = first.strip()
# ms = ms[1:]
#if ms: role = ' '.join(ms).strip()
else:
# XXX: strip quotes from strings like "Himself"?
noteidx = m.find('(')
......
......@@ -210,7 +210,7 @@ class IMDbSqlAccessSystem(IMDbLocalAndSqlAccessSystem):
None if not found."""
nd = analyze_name(name)
res = Name.select(AND(Name.q.name == nd['name'].encode('utf_8'),
Name.q.imdbIndex == nd.get('imdbIndex')))
Name.q.imdbIndex == str(nd.get('imdbIndex'))))
if res.count() != 1:
return None
return res[0].id
......@@ -280,8 +280,8 @@ class IMDbSqlAccessSystem(IMDbLocalAndSqlAccessSystem):
# There're times when I think I'm a genius; this one of
# those times... <g>
if imdbID is not None:
try: movie.imdbID = imdbID
except self.Error: pass
try: movie.imdbID = int(imdbID)
except: pass
return imdbID
def get_imdbPersonID(self, personID):
......@@ -297,8 +297,8 @@ class IMDbSqlAccessSystem(IMDbLocalAndSqlAccessSystem):
namline = build_name(n_dict, canonical=1)
imdbID = self.name2imdbID(namline)
if imdbID is not None:
try: person.imdbID = imdbID
except self.Error: pass
try: person.imdbID = int(imdbID)
except: pass
return imdbID
def do_adult_search(self, doAdult):
......
......@@ -33,8 +33,8 @@ DO_SCRIPTS = 1
# --- NOTHING TO CONFIGURE BELOW.
# version of the software; CVS releases contain a string
# like "-cvsYearMonthDay-OptionalChar".
version = '2.6'
# like ".cvsYearMonthDay(OptionalChar)".
version = '2.7'
home_page = 'http://imdbpy.sf.net/'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment