Commit 3b788bce authored by Ana Guerrero López's avatar Ana Guerrero López

Import Upstream version 3.1

parent 90a7ba5b
...@@ -29,8 +29,9 @@ from gzip import GzipFile ...@@ -29,8 +29,9 @@ from gzip import GzipFile
from types import UnicodeType from types import UnicodeType
from sqlobject import * from sqlobject import *
from sqlobject.sqlbuilder import ISNOTNULL
from imdb.parser.sql import soundex from imdb.parser.sql import soundex, get_movie_data
from imdb.parser.sql.dbschema import * from imdb.parser.sql.dbschema import *
from imdb.utils import analyze_title, analyze_name, \ from imdb.utils import analyze_title, analyze_name, \
build_name, build_title, normalizeName, _articles build_name, build_title, normalizeName, _articles
...@@ -497,10 +498,11 @@ class _BaseCache(dict): ...@@ -497,10 +498,11 @@ class _BaseCache(dict):
def fetchsome(curs, size=20000): def fetchsome(curs, size=20000):
"""Yes, I've read the Python Cookbook! :-)""" """Yes, I've read the Python Cookbook! :-)"""
while 1: while 1:
res = CURS.fetchmany(size) res = curs.fetchmany(size)
if not res: break if not res: break
for r in res: yield r for r in res: yield r
class MoviesCache(_BaseCache): class MoviesCache(_BaseCache):
"""Manage the movies list.""" """Manage the movies list."""
className = 'MoviesCache' className = 'MoviesCache'
...@@ -547,7 +549,8 @@ class MoviesCache(_BaseCache): ...@@ -547,7 +549,8 @@ class MoviesCache(_BaseCache):
if series_d['year'] is None: del series_d['year'] if series_d['year'] is None: del series_d['year']
else: series_d['year'] = str(series_d['year']) else: series_d['year'] = str(series_d['year'])
mdict['episode of'] = series_d mdict['episode of'] = series_d
title = build_title(mdict, canonical=1, ptdf=1) title = build_title(mdict, canonical=1, ptdf=1,
_emptyString='')
dict.__setitem__(self, title, x[0]) dict.__setitem__(self, title, x[0])
self.counter = counter(Title.select().count() + 1) self.counter = counter(Title.select().count() + 1)
Title.sqlmeta.cacheValues = _oldcacheValues Title.sqlmeta.cacheValues = _oldcacheValues
...@@ -572,7 +575,8 @@ class MoviesCache(_BaseCache): ...@@ -572,7 +575,8 @@ class MoviesCache(_BaseCache):
kind = tget('kind') kind = tget('kind')
if kind == 'episode': if kind == 'episode':
#series title #series title
stitle = build_title(tget('episode of'), canonical=1) stitle = build_title(tget('episode of'), canonical=1,
_emptyString='')
episodeOf = self.addUnique(stitle) episodeOf = self.addUnique(stitle)
del t['episode of'] del t['episode of']
year = self.movieYear.get(v) year = self.movieYear.get(v)
...@@ -822,7 +826,7 @@ def doCast(fp, roleid, rolename): ...@@ -822,7 +826,7 @@ def doCast(fp, roleid, rolename):
for item in ll[1:]: for item in ll[1:]:
if not item: continue if not item: continue
if item[0] == '[': if item[0] == '[':
role = item[1:-1] role = item[1:].rstrip(']')
if role[-1:] == ')': if role[-1:] == ')':
nidx = role.find('(') nidx = role.find('(')
if nidx != -1: if nidx != -1:
...@@ -1423,7 +1427,6 @@ CACHE_MID = MoviesCache() ...@@ -1423,7 +1427,6 @@ CACHE_MID = MoviesCache()
CACHE_PID = PersonsCache() CACHE_PID = PersonsCache()
def _cmpfunc(x, y): def _cmpfunc(x, y):
"""Sort a list of tuples, by the length of the first item (in reverse).""" """Sort a list of tuples, by the length of the first item (in reverse)."""
lx = len(x[0]) lx = len(x[0])
...@@ -1457,22 +1460,102 @@ def readConstants(): ...@@ -1457,22 +1460,102 @@ def readConstants():
CCAST_TYPES[x.kind] = x.id CCAST_TYPES[x.kind] = x.id
def notNULLimdbID(cls):
"""Return a list of dictionaries for titles or names for which a
imdbID is present in the database."""
if cls is Title: cname = 'movies'
else: cname = 'people'
print 'SAVING imdbID values for %s...' % cname,
sys.stdout.flush()
try:
tons = cls.select(ISNOTNULL(cls.q.imdbID))
except:
print 'SKIPPING: no data.'
return []
results = []
_kdict = {}
try:
for x in KindType.select():
_kdict[x.id] = x.kind
except:
print 'SKIPPING: no data.'
return []
for t in tons:
if cls is Title:
md = get_movie_data(t.id, _kdict)
else:
md = {'name': t.name}
if t.imdbIndex is not None:
md['imdbIndex'] = t.imdbIndex
md['imdbID'] = t.imdbID
results.append(md)
print 'DONE! (%d entries)' % len(results)
return results
def restoreImdbID(tons, cls):
"""Restore imdbID for movies or people."""
if cls is Title:
CACHE = CACHE_MID
cname = 'movies'
else:
CACHE = CACHE_PID
cname = 'people'
print 'RESTORING imdbID values for %s...' % cname,
sys.stdout.flush()
count = 0
for t in tons:
if cls is Title:
t_str = build_title(t, canonical=1, ptdf=1)
else:
t_str = build_name(t, canonical=1)
t_str = t_str.encode('utf_8')
db_mopID = CACHE.get(t_str)
if db_mopID is None:
continue
try:
mop_in_db = cls.get(db_mopID)
try:
mop_in_db.imdbID = t['imdbID']
except:
continue
except SQLObjectNotFound:
continue
count += 1
print 'DONE! (restored %d entries out of %d)' % (count, len(tons))
# begin the iterations... # begin the iterations...
def run(): def run():
print 'RUNNING imdbpy2sql.py' print 'RUNNING imdbpy2sql.py'
# Storing imdbIDs for movies and persons.
try:
movies_imdbIDs = notNULLimdbID(Title)
except:
movies_imdbIDs = []
print 'WARNING: failed to read imdbIDs for movies'
try:
people_imdbIDs = notNULLimdbID(Name)
except:
people_imdbIDs = []
print 'WARNING: failed to read imdbIDs for people'
# Truncate the current database. # Truncate the current database.
print 'DROPPING current database...', print 'DROPPING current database...',
sys.stdout.flush() sys.stdout.flush()
dropTables() dropTables()
print 'done!' print 'DONE!'
# Rebuild the database structure. # Rebuild the database structure.
print 'CREATING new tables...', print 'CREATING new tables...',
sys.stdout.flush() sys.stdout.flush()
createTables() createTables()
print 'DONE!'
t('dropping and recreating the database')
# Read the constants. # Read the constants.
readConstants() readConstants()
print 'done!'
t('dropping and recreating the database')
# Populate the CACHE_MID instance. # Populate the CACHE_MID instance.
readMovieList() readMovieList()
...@@ -1522,6 +1605,18 @@ def run(): ...@@ -1522,6 +1605,18 @@ def run():
completeCast() completeCast()
t('completeCast()') t('completeCast()')
# Restoring imdbIDs for movies and persons.
try:
restoreImdbID(movies_imdbIDs, Title)
del movies_imdbIDs
except:
print 'WARNING: failed to restore imdbIDs for movies'
try:
restoreImdbID(people_imdbIDs, Name)
del people_imdbIDs
except:
print 'WARNING: failed to restore imdbIDs for people'
# Flush caches. # Flush caches.
CACHE_MID.flush() CACHE_MID.flush()
CACHE_MID.clear() CACHE_MID.clear()
......
...@@ -4,8 +4,28 @@ share the copyright over some portions of the code: ...@@ -4,8 +4,28 @@ share the copyright over some portions of the code:
NAME: Giuseppe "Cowo" Corbelli NAME: Giuseppe "Cowo" Corbelli
EMAIL: <cowo --> lugbs.linux.it> EMAIL: <cowo --> lugbs.linux.it>
DESCRIPTION: provided a lot of code and hints to integrate IMDbPY CONTRIBUTION: provided a lot of code and hints to integrate IMDbPY
with SQLObject, working on the imdbpy2sql.py script and the dbschema.py with SQLObject, working on the imdbpy2sql.py script and the dbschema.py
module. module.
Actually, besides Giuseppe and me, these other people are listed
as developers for the IMDbPY project on sourceforge and may share
copyright on some (minor) portions of the code:
NAME: Martin Kirst
EMAIL: <martin.kirst --> s1998.tu-chemnitz.de>
CONTRIBUTION: has done an important refactoring of the imdbpyweb
program and shares with me the copyright on the whole program.
NAME: H. Turgut Uyar
EMAIL: <uyar --> itu.edu.tr>
CONTRIBUTION: has created some tests for the test-suite.
NAME: Jesper Nøhr
EMAIL: <jesper --> noehr.org>
CONTRIBUTION: provided extensive testing and some patches for
the 'http' data access system.
...@@ -15,11 +15,22 @@ I'd like to thank the following people for their help: ...@@ -15,11 +15,22 @@ I'd like to thank the following people for their help:
* Jesper Nøhr for a lot of testing, especially on the 'sql'. * Jesper Nøhr for a lot of testing, especially on the 'sql'.
* Jon Sabo for a bug report about unicode and the imdbpy2sql.py script
and some feedback.
* Andrew Pendleton for a report about a very hideous bug in
the imdbpy2sql.py (garbage in the plain text data files + programming
errors + utf8 strings + postgres).
* Ataru Moroboshi ;-) for a bug report about role/duty and notes.
* Ivan Kedrin for a bug report about the analyze_title function. * Ivan Kedrin for a bug report about the analyze_title function.
* Hadley Rich for reporting bugs and providing patches for troubles * Hadley Rich for reporting bugs and providing patches for troubles
parsing tv series' episodes and searching for tv series' titles. parsing tv series' episodes and searching for tv series' titles.
* Jamie R. Rytlewski for a suggestion about saving imbIDs in 'sql'.
* Vincent Crevot, for a bug report about unicode support. * Vincent Crevot, for a bug report about unicode support.
* Jay Klein for a bug report and testing to fix a nasty bug in the * Jay Klein for a bug report and testing to fix a nasty bug in the
......
Changelog for IMDbPY Changelog for IMDbPY
==================== ====================
* What's new in release 3.1 "The Snake King" (18 Jul 2007)
[global]
- the IMDbPYweb account now returns a single item, when a search
returns only one "good enough" match (this is the IMDb's default).
- updated the documentation.
- updated list of contributors and developers.
[http]
- supported the new result page for searches.
- supported the 'synopsis' page.
- supported the 'parents guide' page.
- fixed a bug retrieving notes about a movie's connections.
- fixed a bug for python2.2 (s60 mobile phones).
- fixed a bug with 'Production Notes/Status'.
- fixed a bug parsing role/duty and notes (also for httpThin).
- fixed a bug retrieving user ratings.
- fixed a bug (un)setting the proxy.
- fixed 2 bugs in movie/person news.
- fixed a bug in movie faqs.
- fixed a bug in movie taglines.
- fixed a bug in movie quotes.
- fixed a bug in movie title, in "full cast and crew" page.
- fixed 2 bugs in persons' other works.
[sql]
- hypothetical fix for a unicode problem in the imdbpy2sql.py script.
- now the 'imdbID' fields in the Title and Name tables are restored,
updating from an older version.
- fixed a nasty bug handling utf-8 strings in the imdbpy2sql.py script.
[mobile]
- supported the new result page for searches.
- fixed a bug for python2.2 (s60 mobile phones).
- fixed a bug searching for persons with single match and no
messages in the board.
- fixed a bug parsing role/duty and notes.
* What's the new in release 3.0 "Spider-Man 3" (03 May 2007) * What's the new in release 3.0 "Spider-Man 3" (03 May 2007)
[global] [global]
- IMDbPY now works with the new IMDb's site design; a new account is - IMDbPY now works with the new IMDb's site design; a new account is
......
...@@ -63,7 +63,8 @@ the movies, only the main information are retrieved (see the 'httpThin' ...@@ -63,7 +63,8 @@ the movies, only the main information are retrieved (see the 'httpThin'
notes). It should be, at usage time, from 2 to 20 times faster than notes). It should be, at usage time, from 2 to 20 times faster than
the "http"/"httpThin" data access system. the "http"/"httpThin" data access system.
This code is still BETA! Please report me bugs/ideas/hints... This code still needs tests on mobile phones!
Please report any bugs/ideas/hints...
Usage: Usage:
from imdb import IMDb from imdb import IMDb
...@@ -79,7 +80,12 @@ Usage: ...@@ -79,7 +80,12 @@ Usage:
A GUI for Series 60 smart phones, developed by Tero Saarni, A GUI for Series 60 smart phones, developed by Tero Saarni,
is available at: is available at:
http://kotisivu.mtv3.fi/terosaarni/python/imdbpygui/ http://imdbpy.sourceforge.net/?page=mobile
On some mobile phone a pair of modules can be missing, and
you have to install it manually as libraries; you can find
these two modules (sgmllib.py and htmlentitydefs.py) here:
http://imdbpy.sourceforge.net/symbiangui/mobile-imdbpy-modules-0.1.tar.gz
THE "HTTPTHIN" DATA ACCESS SYSTEM THE "HTTPTHIN" DATA ACCESS SYSTEM
......
...@@ -3,19 +3,12 @@ ...@@ -3,19 +3,12 @@
======================== ========================
On 19 February 2007, IMDb introduced a complete redesign of their On 19 February 2007, IMDb introduced a complete redesign of their
web site. This means that the 'http' and 'mobile' parser are no web site.
more able to parse the new html; as a temporary solution, the account
used by IMDbPY was set to "use previous layout", meaning that - for
a certain amount of time - the current IMDbPY version (2.9) will work.
This (2.9) will be the last version of IMDbPY to parse the old layout: Since release 3.0, IMDbPY uses a new account to access the IMDb
from now on, on the CVS, the development will be geared to use the new web site, parsing the new layout.
layout - and a new IMDb's account will be used.
Conclusion: if you find a bug in 'http' or 'mobile' in this release,
please report it anyway (it can also affect the new code), but consider
that a bit of time will be needed, to fix everything.
Even better, help the development subscribing to the mailing list:
http://imdbpy.sourceforge.net/?page=devel
Older versions still access the old layout, so they are still (more
or less) working; obviously only the new layout is supported from
now on.
...@@ -97,7 +97,7 @@ complete plain text data files set (as of 12 Nov 2006, with about ...@@ -97,7 +97,7 @@ complete plain text data files set (as of 12 Nov 2006, with about
database | time in minutes: total (insert data/create indexes) database | time in minutes: total (insert data/create indexes)
----------------------+----------------------------------------------------- ----------------------+-----------------------------------------------------
MySQL 5.0 MyISAM | 115 (95/20) MySQL 5.0 MyISAM | 115 (95/20)
MYSQL 5.0 InnoDB | ??? (80/???) MySQL 5.0 InnoDB | ??? (80/???)
| maybe I've not configured it properly: it | maybe I've not configured it properly: it
| looks like the creation of the indexes will | looks like the creation of the indexes will
| take more than 2 or 3 hours. | take more than 2 or 3 hours.
...@@ -116,6 +116,35 @@ If you have different experiences, please tell me! ...@@ -116,6 +116,35 @@ If you have different experiences, please tell me!
The imdbpy2sql.py will print a lot of debug information on standard output; The imdbpy2sql.py will print a lot of debug information on standard output;
you can save it in a file, appending (without quotes) "2>&1 | tee output.txt" you can save it in a file, appending (without quotes) "2>&1 | tee output.txt"
[sqlite failure]
It seems that, with older versions of the python-sqlite package, the first
run may fail; if you get a DatabaseError exception saying "no such table",
try running again the command with the same arguments.
[data truncated]
If you get an insane amount (hundreds or thousands, on various text
columns) of warnings like these lines:
imdbpy2sql.py:727: Warning: Data truncated for column 'person_role' at row 4979
CURS.executemany(self.sqlString, self.converter(self.values()))
you probably have a problem with the configuration of your database.
The error comes from strings that get cut at the first non-ASCII char (and
so you're losing a lot of information).
To avoid this problem, you must be sure that your database
server is set up properly, with the library/client in use configured
to communicate with the server in a consistent way.
E.g., for MySQL you can set:
character-set-server = utf8
default-collation = utf8_unicode_ci
default-character-set = utf8
or even:
character-set-server = latin1
default-collation = latin1_bin
default-character-set = latin1
[adult titles] [adult titles]
Beware that, while running, the imdbpy2sql.py script will output a lot Beware that, while running, the imdbpy2sql.py script will output a lot
of strings containing both person names and movie titles. The script of strings containing both person names and movie titles. The script
...@@ -125,14 +154,6 @@ screaming 'daddy! daddy! what kind of animals trains Rocco in the ...@@ -125,14 +154,6 @@ screaming 'daddy! daddy! what kind of animals trains Rocco in the
documentary "Rocco: Animal Trainer 17"???'... well it's not my fault! ;-) documentary "Rocco: Animal Trainer 17"???'... well it's not my fault! ;-)
SQLITE NOTE
===========
It seems that, with older versions of the python-sqlite package, the first
run may fail; if you get a DatabaseError exception saying "no such table",
try running again the command with the same arguments.
SQL USAGE SQL USAGE
========= =========
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
...@@ -121,6 +121,7 @@ class Movie(_Container): ...@@ -121,6 +121,7 @@ class Movie(_Container):
'merchandise': 'merchandising links', 'merchandise': 'merchandising links',
'sales': 'merchandising links', 'sales': 'merchandising links',
'faq': 'faqs', 'faq': 'faqs',
'parental guide': 'parents guide',
'frequently asked questions': 'faqs'} 'frequently asked questions': 'faqs'}
keys_tomodify_list = ('plot', 'trivia', 'alternate versions', 'goofs', keys_tomodify_list = ('plot', 'trivia', 'alternate versions', 'goofs',
......
...@@ -345,6 +345,13 @@ class IMDbBase: ...@@ -345,6 +345,13 @@ class IMDbBase:
title = title.encode('utf-8') title = title.encode('utf-8')
params = 'q=%s;s=pt' % str(urllib.quote_plus(title)) params = 'q=%s;s=pt' % str(urllib.quote_plus(title))
content = self._searchIMDb(params) content = self._searchIMDb(params)
if content and content[:512].find('<title>IMDb Title') != -1:
# Sometimes (e.g.: for titles with a "+" in it) a list
# of results is returned even for Exact Primary searches;
# this tries to deal with it, hoping that a "normal" query
# will result in just one title.
params = 's=tt&q=%s' % str(urllib.quote_plus(title))
content = self._searchIMDb(params)
if not content: return None if not content: return None
from imdb.parser.http.searchMovieParser import BasicMovieParser from imdb.parser.http.searchMovieParser import BasicMovieParser
mparser = BasicMovieParser() mparser = BasicMovieParser()
...@@ -362,6 +369,9 @@ class IMDbBase: ...@@ -362,6 +369,9 @@ class IMDbBase:
name = name.encode('utf-8') name = name.encode('utf-8')
params = 'q=%s;s=pn' % str(urllib.quote_plus(name)) params = 'q=%s;s=pn' % str(urllib.quote_plus(name))
content = self._searchIMDb(params) content = self._searchIMDb(params)
if content and content[:512].find('<title>IMDb Name') != -1:
params = 's=nm&q=%s' % str(urllib.quote_plus(name))
content = self._searchIMDb(params)
if not content: return None if not content: return None
from imdb.parser.http.searchPersonParser import BasicPersonParser from imdb.parser.http.searchPersonParser import BasicPersonParser
pparser = BasicPersonParser() pparser = BasicPersonParser()
......
...@@ -24,6 +24,7 @@ along with this program; if not, write to the Free Software ...@@ -24,6 +24,7 @@ along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
""" """
import sys
from urllib import FancyURLopener, quote_plus from urllib import FancyURLopener, quote_plus
from codecs import lookup from codecs import lookup
...@@ -43,7 +44,8 @@ from movieParser import movie_parser, plot_parser, movie_awards_parser, \ ...@@ -43,7 +44,8 @@ from movieParser import movie_parser, plot_parser, movie_awards_parser, \
soundclips_parser, videoclips_parser, news_parser, \ soundclips_parser, videoclips_parser, news_parser, \
photosites_parser, amazonrev_parser, business_parser, \ photosites_parser, amazonrev_parser, business_parser, \
literature_parser, sales_parser, episodes_parser, \ literature_parser, sales_parser, episodes_parser, \
eprating_parser, movie_faqs_parser, airing_parser eprating_parser, movie_faqs_parser, airing_parser, \
synopsis_parser, parentsguide_parser
from searchMovieParser import search_movie_parser from searchMovieParser import search_movie_parser
from personParser import maindetails_parser, bio_parser, \ from personParser import maindetails_parser, bio_parser, \
otherworks_parser, person_awards_parser, \ otherworks_parser, person_awards_parser, \
...@@ -53,6 +55,7 @@ from personParser import maindetails_parser, bio_parser, \ ...@@ -53,6 +55,7 @@ from personParser import maindetails_parser, bio_parser, \
from searchPersonParser import search_person_parser from searchPersonParser import search_person_parser
from utils import ParserBase from utils import ParserBase
PY_VERSION = sys.version_info[:2]
# Misc URLs # Misc URLs
imdbURL_movie = 'http://akas.imdb.com/title/tt%s/' imdbURL_movie = 'http://akas.imdb.com/title/tt%s/'
...@@ -73,14 +76,42 @@ class IMDbURLopener(FancyURLopener): ...@@ -73,14 +76,42 @@ class IMDbURLopener(FancyURLopener):
"""Fetch web pages and handle errors.""" """Fetch web pages and handle errors."""
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
FancyURLopener.__init__(self, *args, **kwargs) FancyURLopener.__init__(self, *args, **kwargs)
# Headers to add to every request.
# XXX: IMDb's web server doesn't like urllib-based programs, # XXX: IMDb's web server doesn't like urllib-based programs,
# so lets fake to be Mozilla. # so lets fake to be Mozilla.
# Wow! I'm shocked by my total lack of ethic! <g> # Wow! I'm shocked by my total lack of ethic! <g>
self.addheaders = [('User-agent', 'Mozilla/5.0')] self.set_header('User-agent', 'Mozilla/5.0')
# This class is used also to perform "Exact Primary [Title|Name]" # XXX: This class is used also to perform "Exact Primary
# searches, and so by default the cookie is set. # [Title|Name]" searches, and so by default the cookie is set.
c_header = 'id=%s; uu=%s' % (_cookie_id, _cookie_uu) c_header = 'id=%s; uu=%s' % (_cookie_id, _cookie_uu)
self.addheaders.append(('Cookie', c_header)) self.set_header('Cookie', c_header)
def get_proxy(self):
    """Return the proxy used for HTTP, or an empty string if unset."""
    try:
        return self.proxies['http']
    except KeyError:
        return ''
def set_proxy(self, proxy):
    """Set the proxy used for HTTP requests.

    An empty/None value removes any previously set proxy; the
    'http://' scheme is prepended when missing."""
    if not proxy:
        # "key in dict" instead of the dated dict.has_key().
        if 'http' in self.proxies:
            del self.proxies['http']
    else:
        if not proxy.lower().startswith('http://'):
            proxy = 'http://%s' % proxy
        self.proxies['http'] = proxy
def set_header(self, header, value, _overwrite=True):
    """Store a default header; unless _overwrite is false, any
    existing header with the same name is replaced."""
    if _overwrite: self.del_header(header)
    self.addheaders.append((header, value))
def del_header(self, header):
    """Remove the first stored default header with the given name,
    if present."""
    idx = 0
    nr_headers = len(self.addheaders)
    while idx < nr_headers:
        if self.addheaders[idx][0] == header:
            del self.addheaders[idx]
            break
        idx += 1
def retrieve_unicode(self, url, size=-1): def retrieve_unicode(self, url, size=-1):
"""Retrieves the given URL, and returns a unicode string, """Retrieves the given URL, and returns a unicode string,
...@@ -89,11 +120,15 @@ class IMDbURLopener(FancyURLopener): ...@@ -89,11 +120,15 @@ class IMDbURLopener(FancyURLopener):
encode = None encode = None
try: try:
if size != -1: if size != -1:
self.addheader('Range', 'bytes=0-%d' % size) self.set_header('Range', 'bytes=0-%d' % size)
uopener = self.open(url) uopener = self.open(url)
content = uopener.read(size=size) kwds = {}
if PY_VERSION > (2, 3):
kwds['size'] = size
content = uopener.read(**kwds)
# Maybe the server is so nice to tell us the charset...
server_encode = uopener.info().getparam('charset') server_encode = uopener.info().getparam('charset')
# look at the content-type HTML meta tag. # Otherwise, look at the content-type HTML meta tag.
if server_encode is None and content: if server_encode is None and content:
first_bytes = content[:512] first_bytes = content[:512]
begin_h = first_bytes.find('text/html; charset=') begin_h = first_bytes.find('text/html; charset=')
...@@ -109,19 +144,18 @@ class IMDbURLopener(FancyURLopener): ...@@ -109,19 +144,18 @@ class IMDbURLopener(FancyURLopener):
pass pass
uopener.close() uopener.close()
if size != -1: if size != -1:
for index in xrange(len(self.addheaders)): self.del_header('Range')
if self.addheaders[index][0] == 'Range':
del self.addheaders[index]
self.close() self.close()
except IOError, e: except IOError, e:
if size != -1: if size != -1:
for index in xrange(len(self.addheaders)): # Ensure that the Range header is removed.
if self.addheaders[index][0] == 'Range': self.del_header('Range')
del self.addheaders[index]
raise IMDbDataAccessError, {'errcode': e.errno, raise IMDbDataAccessError, {'errcode': e.errno,
'errmsg': str(e.strerror), 'errmsg': str(e.strerror),
'url': url, 'url': url,
'proxy': self.proxies.get('http', '')} 'proxy': self.get_proxy(),
'exception type': 'IOError',
'original exception': e}
if encode is None: if encode is None:
encode = 'latin_1' encode = 'latin_1'
# The detection of the encoding is error prone... # The detection of the encoding is error prone...
...@@ -135,14 +169,19 @@ class IMDbURLopener(FancyURLopener): ...@@ -135,14 +169,19 @@ class IMDbURLopener(FancyURLopener):
'errcode': errcode, 'errcode': errcode,
'errmsg': errmsg, 'errmsg': errmsg,
'headers': headers, 'headers': headers,
'proxy': self.proxies.get('http', '')} 'error type': 'http_error_default',
'proxy': self.get_proxy()}
def open_unknown(self, fullurl, data=None): def open_unknown(self, fullurl, data=None):
raise IMDbDataAccessError, {'fullurl': fullurl, raise IMDbDataAccessError, {'fullurl': fullurl,
'data': str(data), 'data': str(data),
'proxy': self.proxies.get('http', '')} 'error type': 'open_unknown',
'proxy': self.get_proxy()}
def open_unknown_proxy(self, proxy, fullurl, data=None): def open_unknown_proxy(self, proxy, fullurl, data=None):
raise IMDbDataAccessError, {'proxy': str(proxy), raise IMDbDataAccessError, {'proxy': str(proxy),
'fullurl': fullurl, 'fullurl': fullurl,
'error type': 'open_unknown_proxy',
'data': str(data)} 'data': str(data)}
...@@ -151,12 +190,11 @@ class IMDbHTTPAccessSystem(IMDbBase): ...@@ -151,12 +190,11 @@ class IMDbHTTPAccessSystem(IMDbBase):
accessSystem = 'http' accessSystem = 'http'
urlOpener = IMDbURLopener()
def __init__(self, isThin=0, adultSearch=1, proxy=-1, def __init__(self, isThin=0, adultSearch=1, proxy=-1,
*arguments, **keywords): *arguments, **keywords):
"""Initialize the access system.""" """Initialize the access system."""
IMDbBase.__init__(self, *arguments, **keywords) IMDbBase.__init__(self, *arguments, **keywords)
self.urlOpener = IMDbURLopener()
# When isThin is set, we're parsing the "maindetails" page # When isThin is set, we're parsing the "maindetails" page
# of a movie (instead of the "combined" page) and movie/person # of a movie (instead of the "combined" page) and movie/person
# references are not collected if no defaultModFunct is provided. # references are not collected if no defaultModFunct is provided.
...@@ -171,7 +209,8 @@ class IMDbHTTPAccessSystem(IMDbBase): ...@@ -171,7 +209,8 @@ class IMDbHTTPAccessSystem(IMDbBase):
from imdb.utils import modNull from imdb.utils import modNull
self._defModFunct = modNull self._defModFunct = modNull
self.do_adult_search(adultSearch)