Commit 6d743aa2 authored by Ana Guerrero López

Import Upstream version 4.7.0

parent e82bc4f6
Metadata-Version: 1.0
Name: IMDbPY
Version: 4.6
Version: 4.7
Summary: Python package to access the IMDb's database
Home-page: http://imdbpy.sf.net/
Author: Davide Alberani
......
@@ -53,7 +53,7 @@ docs/README.users
docs/README.utf8
docs/TODO.txt
docs/imdbpy.cfg
docs/imdbpy46.dtd
docs/imdbpy47.dtd
docs/imdbpyPowered.png
docs/imdbpyico.png
docs/imdbpyico.xpm
......
Metadata-Version: 1.0
Name: IMDbPY
Version: 4.6
Version: 4.7
Summary: Python package to access the IMDb's database
Home-page: http://imdbpy.sf.net/
Author: Davide Alberani
......
@@ -26,10 +26,12 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import os, sys, getopt, time, re, warnings
try: import cPickle as pickle
except ImportError: import pickle
try: from hashlib import md5
except ImportError: from md5 import md5
from gzip import GzipFile
from types import UnicodeType
from imdb.parser.sql.dbschema import *
from imdb.parser.sql import get_movie_data, soundex
from imdb.utils import analyze_title, analyze_name, date_and_notes, \
build_name, build_title, normalizeName, normalizeTitle, _articles, \
@@ -116,7 +118,8 @@ MYSQLFORCEMYISAM_OPTS = ['-e',
'AFTER_CREATE:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=MyISAM;']
MYSQLINNODB_OPTS = ['-e',
'AFTER_CREATE:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=MyISAM;',
'-e', 'END:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=InnoDB;']
'-e',
'BEFORE_INDEXES:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=InnoDB;']
SQLSERVER_OPTS = ['-e', 'BEFORE_EVERY_TODB:SET IDENTITY_INSERT %(table)s ON;',
'-e', 'AFTER_EVERY_TODB:SET IDENTITY_INSERT %(table)s OFF;']
SQLITE_OPTS = ['-e', 'BEGIN:PRAGMA synchronous = OFF;',
@@ -975,7 +978,8 @@ class MoviesCache(_BaseCache):
self.sqlstr, self.converter = createSQLstr(Title, ('id', 'title',
'imdbIndex', 'kindID', 'productionYear',
'imdbID', 'phoneticCode', 'episodeOfID',
'seasonNr', 'episodeNr', 'seriesYears'))
'seasonNr', 'episodeNr', 'seriesYears',
'md5sum'))
def populate(self):
print ' * POPULATING %s...' % self.className
@@ -1048,7 +1052,8 @@ class MoviesCache(_BaseCache):
soundex = title_soundex(title)
lapp((v, title, tget('imdbIndex'), KIND_IDS[kind],
tget('year'), None, soundex, episodeOf,
tget('season'), tget('episode'), tget('series years')))
tget('season'), tget('episode'), tget('series years'),
md5(k).hexdigest()))
self._runCommand(l)
def _runCommand(self, dataList):
@@ -1079,7 +1084,7 @@ class PersonsCache(_BaseCache):
self._id_for_custom_q = 'PERSONS'
self.sqlstr, self.converter = createSQLstr(Name, ['id', 'name',
'imdbIndex', 'imdbID', 'namePcodeCf',
'namePcodeNf', 'surnamePcode'])
'namePcodeNf', 'surnamePcode', 'md5sum'])
def populate(self):
print ' * POPULATING PersonsCache...'
@@ -1117,7 +1122,8 @@ class PersonsCache(_BaseCache):
name = tget('name')
namePcodeCf, namePcodeNf, surnamePcode = name_soundexes(name)
lapp((v, name, tget('imdbIndex'), None,
namePcodeCf, namePcodeNf, surnamePcode))
namePcodeCf, namePcodeNf, surnamePcode,
md5(k).hexdigest()))
if not CSV_DIR:
CURS.executemany(self.sqlstr, self.converter(l))
else:
@@ -1135,7 +1141,7 @@ class CharactersCache(_BaseCache):
self._id_for_custom_q = 'CHARACTERS'
self.sqlstr, self.converter = createSQLstr(CharName, ['id', 'name',
'imdbIndex', 'imdbID', 'namePcodeNf',
'surnamePcode'])
'surnamePcode', 'md5sum'])
def populate(self):
print ' * POPULATING CharactersCache...'
@@ -1174,7 +1180,7 @@ class CharactersCache(_BaseCache):
namePcodeCf, namePcodeNf, surnamePcode = name_soundexes(name,
character=True)
lapp((v, name, tget('imdbIndex'), None,
namePcodeCf, surnamePcode))
namePcodeCf, surnamePcode, md5(k).hexdigest()))
if not CSV_DIR:
CURS.executemany(self.sqlstr, self.converter(l))
else:
@@ -1192,7 +1198,7 @@ class CompaniesCache(_BaseCache):
self._id_for_custom_q = 'COMPANIES'
self.sqlstr, self.converter = createSQLstr(CompanyName, ['id', 'name',
'countryCode', 'imdbID', 'namePcodeNf',
'namePcodeSf'])
'namePcodeSf', 'md5sum'])
def populate(self):
print ' * POPULATING CompaniesCache...'
@@ -1233,7 +1239,8 @@ class CompaniesCache(_BaseCache):
country = tget('country')
if k != name:
namePcodeSf = soundex(k)
lapp((v, name, country, None, namePcodeNf, namePcodeSf))
lapp((v, name, country, None, namePcodeNf, namePcodeSf,
md5(k).hexdigest()))
if not CSV_DIR:
CURS.executemany(self.sqlstr, self.converter(l))
else:
@@ -1585,7 +1592,8 @@ def doAkaNames():
try: fp = SourceFile('aka-names.list.gz', start=AKAN_START)
except IOError: return
sqldata = SQLData(table=AkaName, cols=['personID', 'name', 'imdbIndex',
'namePcodeCf', 'namePcodeNf', 'surnamePcode'])
'namePcodeCf', 'namePcodeNf', 'surnamePcode',
'md5sum'])
for line in fp:
if line and line[0] != ' ':
if line[0] == '\n': continue
@@ -1602,7 +1610,8 @@ def doAkaNames():
name = name_dict.get('name')
namePcodeCf, namePcodeNf, surnamePcode = name_soundexes(name)
sqldata.add((pid, name, name_dict.get('imdbIndex'),
namePcodeCf, namePcodeNf, surnamePcode))
namePcodeCf, namePcodeNf, surnamePcode,
md5(line).hexdigest()))
if count % 10000 == 0:
print 'SCANNING akanames:', _(line)
count += 1
@@ -1618,6 +1627,7 @@ class AkasMoviesCache(MoviesCache):
def __init__(self, *args, **kdws):
MoviesCache.__init__(self, *args, **kdws)
self.flushEvery = 50000
self._mapsIDsToTitles = True
self.notes = {}
self.ids = {}
self._table_name = tableName(AkaTitle)
@@ -1625,7 +1635,7 @@ class AkasMoviesCache(MoviesCache):
self.sqlstr, self.converter = createSQLstr(AkaTitle, ('id', 'movieID',
'title', 'imdbIndex', 'kindID', 'productionYear',
'phoneticCode', 'episodeOfID', 'seasonNr',
'episodeNr', 'note'))
'episodeNr', 'note', 'md5sum'))
def flush(self, *args, **kwds):
# Preserve consistency of ForeignKey.
@@ -1644,8 +1654,10 @@ class AkasMoviesCache(MoviesCache):
# id of the referred title.
original_title_id = self.ids.get(the_id) or 0
new_item = [the_id, original_title_id]
new_item += item[1:-1]
md5sum = item[-1]
new_item += item[1:-2]
new_item.append(self.notes.get(the_id))
new_item.append(md5sum)
new_dataListapp(tuple(new_item))
new_dataList.reverse()
if not CSV_DIR:
......
@@ -21,6 +21,18 @@ of help, and also for the wonderful http://bitbucket.org)
Below is a list of people who contributed bug reports, small
patches and hints (kept in reverse order since IMDbPY 4.5):
* Ramusus for a lot of precious bug reports.
* Laurent Vergne for a hint about InnoDB, MyISAM and foreign keys.
* Israel Fruch for patches to support the new set of parsers.
* Inf3cted MonkeY, for a bug report about 'vote details'.
* Alexmipego, for suggesting the addition of an md5sum to titles and names.
* belgabortm for a bug report about movies with multiple 'countries'.
* David Kaufman for an idea to make the 'update' method more robust.
* Dustin Wyatt for a bug report about SQLite in Python 2.6.
......
Changelog for IMDbPY
====================
* What's new in release 4.7 "Saw VI" (23 Jan 2011)
[http]
- first fixes for the new set of parsers.
- first changes to support the new set of web pages.
- fix for lists of uncategorized episodes.
- fix for movies with multiple countries.
- fix for the currentRole property.
- more robust handling for vote details.
[mobile]
- first fixes for the new set of parsers.
[sql]
- the tables containing titles and names (and akas) now
include an 'md5sum' column, computed from the "long imdb
canonical title/name"; see the sketch below.
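A minimal sketch of how such a value can be computed, assuming
the same hashlib/md5 fallback used by the imdbpy2sql.py script
(the title below is just an illustrative "long imdb canonical
title"):

    try:
        from hashlib import md5
    except ImportError:
        # Fallback for Python versions older than 2.5.
        from md5 import md5

    # Long imdb canonical title of the 1999 movie "The Matrix".
    print md5('Matrix, The (1999)').hexdigest()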
* What's new in release 4.6 "The Road" (19 Jun 2010)
[general]
- introduced the 'full-size cover url' and 'full-size headshot'
......
@@ -2,13 +2,32 @@
IMDb's web site redesign
========================
On 19 February 2007, IMDb introduced a complete redesign of their
web site.
In September 2010, the IMDb web pages had a major redesign.
With IMDbPY 4.7 we're trying to parse the new web pages,
but it will take some time before all the bugs are fixed.
Since release 3.0, IMDbPY uses a new account to access the IMDb
web site, parsing the new layout.
Any help (fixing parsers or simple bug reports) is greatly
appreciated.
Older versions still access the old layout, so they keep
working (more or less); obviously, only the new layout will be
supported from now on.
Beware that:
- the "httpThin" data access method is badly broken and
probably it will not fixed.
- the "mobile" data access method can be partially broken,
and will be fixed: please report any problem.
- some of the information under these keys may come out
garbled: soundtrack, awards, episodes rating, faqs.
- information about series was not extensively tested.
- it's possible that some information will be missing, like
"in development" movies.
The above problems, with the exception of "httpThin", will be
fixed in future releases; see the example below for picking a
working access method.
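As a minimal sketch (assuming only the documented IMDb() factory
function and the search_movie method), switching to one of the
still-working data access methods looks like this:

    from imdb import IMDb

    # 'http' targets the new layout; 'mobile' is mostly working.
    # 'httpThin' would only trigger the warning described above.
    ia = IMDb('http')
    results = ia.search_movie('The Untouchables')
    print results[0]['title']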
Notes about the code: we have a very powerful and efficient
parsing infrastructure, but after many releases and so many
changes of the IMDb pages, some of the main parsers are showing
their age. So, parsers for main information about movies and
persons should probably be rewritten from scratch, and the same
applies to helper functions like "build_person" and "build_movie"
in imdb.parser.http.utils.
@@ -8,6 +8,7 @@ NOTE: it's always time to clean the code! <g>
[general]
* improve the logging facility.
* ability to silence warnings/logging.
* create mobile versions for smart phones (with GUI).
* Write better summary() methods for Movie, Person, Character
and Company classes.
......
@@ -66,6 +66,7 @@
| episode-title
| external-reviews
| faqs
| film-length
| film-negative-format
| full-size-cover-url
| genres
@@ -451,6 +452,8 @@
<!ATTLIST demographic %common.attrs;>
<!ELEMENT episodes-rating (item)*>
<!ATTLIST episodes-rating %common.attrs;>
<!ELEMENT film-length (item)*>
<!ATTLIST film-length %common.attrs;>
<!ELEMENT film-negative-format (item)*>
<!ATTLIST film-negative-format %common.attrs;>
<!ELEMENT genres (item)*>
......
@@ -25,7 +25,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
__all__ = ['IMDb', 'IMDbError', 'Movie', 'Person', 'Character', 'Company',
'available_access_systems']
__version__ = VERSION = '4.6'
__version__ = VERSION = '4.7'
# Import compatibility module (importing it is enough).
import _compat
@@ -179,6 +179,10 @@ def IMDb(accessSystem=None, *arguments, **keywords):
from parser.http import IMDbHTTPAccessSystem
return IMDbHTTPAccessSystem(*arguments, **keywords)
elif accessSystem in ('httpThin', 'webThin', 'htmlThin'):
import logging
logging.warn('httpThin is badly broken and' \
' will not be fixed; please switch' \
' to "http" or "mobile"')
from parser.http import IMDbHTTPAccessSystem
return IMDbHTTPAccessSystem(isThin=1, *arguments, **keywords)
elif accessSystem in ('mobile',):
......
@@ -109,6 +109,10 @@ _old_cookie_uu = '3M3AXsquTU5Gur/Svik+ewflPm5Rk2ieY3BIPlLjyK3C0Dp9F8UoPgbTyKiGtZ
_cookie_id = 'rH1jNAkjTlNXvHolvBVBsgaPICNZbNdjVjzFwzas9JRmusdjVoqBs/Hs12NR+1WFxEoR9bGKEDUg6sNlADqXwkas12N131Rwdb+UQNGKN8PWrNdjcdqBQVLq8mbGDHP3hqzxhbD692NQi9D0JjpBtRaPIbP1zNdjUOqENQYv1ADWrNcT9vyXU1'
_cookie_uu = 'su4/m8cho4c6HP+W1qgq6wchOmhnF0w+lIWvHjRUPJ6nRA9sccEafjGADJ6hQGrMd4GKqLcz2X4z5+w+M4OIKnRn7FpENH7dxDQu3bQEHyx0ZEyeRFTPHfQEX03XF+yeN1dsPpcXaqjUZAw+lGRfXRQEfz3RIX9IgVEffdBAHw2wQXyf9xdMPrQELw0QNB8dsffsqcdQemjPB0w+moLcPh0JrKrHJ9hjBzdMPpcXTH7XRwwOk='
# imdbpy2010 account.
#_cookie_id = 'QrCdxVi+L+WgqOLrQJJgBgRRXGInphxiBPU/YXSFDyExMFzCp6YcYgSVXyEUhS/xMID8wqemHGID4DlntwZ49vemP5UXsAxiJ4D6goSmHGIgNT9hMXBaRSF2vMS3phxB0bVfQiQlP1RxdrzhB6YcRHFASyIhQVowwXCKtDSlD2YhgRvxBsCKtGemHBKH9mxSI='
#_cookie_uu = 'oiEo2yoJFCA2Zbn/o7Z1LAPIwotAu6QdALv3foDb1x5F/tdrFY63XkSfty4kntS8Y8jkHSDLt3406+d+JThEilPI0mtTaOQdA/t2/iErp22jaLdeVU5ya4PIREpj7HFdpzhEHadcIAngSER50IoHDpD6Bz4Qy3b+UIhE/hBbhz5Q63ceA2hEvhPo5B0FnrL9Q8jkWjDIbA0Au3d+AOtnXoCIRL4Q28c+UOtnXpP4RL4T6OQdA+6ijUCI5B0AW2d+UOtnXpPYRL4T6OQdA8jkTUOYlC0A=='
class _FakeURLOpener(object):
"""Fake URLOpener object, used to return empty strings instead of
......
@@ -225,8 +225,8 @@ class DOMHTMLMovieParser(DOMParserBase):
postprocess=lambda x: x.strip()),
Attribute(key="countries",
path="./h5[starts-with(text(), " \
"'Countr')]/..//a/text()",
postprocess=makeSplitter(sep='\n')),
"'Countr')]/../div[@class='info-content']//text()",
postprocess=makeSplitter('|')),
Attribute(key="language",
path="./h5[starts-with(text(), " \
"'Language')]/..//text()",
@@ -541,11 +541,13 @@ class DOMHTMLPlotParser(DOMParserBase):
def _process_award(x):
award = {}
award['award'] = x.get('award').strip()
if not award['award']:
return {}
award['year'] = x.get('year').strip()
if award['year'] and award['year'].isdigit():
award['year'] = int(award['year'])
award['result'] = x.get('result').strip()
award['award'] = x.get('award').strip()
category = x.get('category').strip()
if category:
award['category'] = category
@@ -649,6 +651,8 @@ class DOMHTMLAwardsParser(DOMParserBase):
assigner = self.xpath(dom, "//a/text()")[0]
for entry in data[key]:
if not entry.has_key('name'):
if not entry:
continue
# this is an award, not a recipient
entry['assigner'] = assigner.strip()
# find the recipients
@@ -996,8 +1000,10 @@ class DOMHTMLRatingsParser(DOMParserBase):
if votes:
nd['number of votes'] = {}
for i in xrange(1, 11):
nd['number of votes'][int(votes[i]['ordinal'])] = \
int(votes[i]['votes'].replace(',', ''))
_ordinal = int(votes[i]['ordinal'])
_strvts = votes[i]['votes'] or '0'
nd['number of votes'][_ordinal] = \
int(_strvts.replace(',', ''))
mean = data.get('mean and median', '')
if mean:
means = self.re_means.findall(mean)
@@ -1699,10 +1705,14 @@ class DOMHTMLEpisodesParser(DOMParserBase):
try: season_key = int(season_key)
except: pass
nd[season_key] = {}
ep_counter = 1
for episode in data[key]:
if not episode: continue
episode_key = episode.get('episode')
if episode_key is None: continue
if not isinstance(episode_key, int):
episode_key = ep_counter
ep_counter += 1
cast_key = 'Season %s, Episode %s:' % (season_key,
episode_key)
if data.has_key(cast_key):
......
@@ -63,85 +63,131 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
_birth_attrs = [Attribute(key='birth date',
path={
'day': "./div/a[starts-with(@href, " \
'day': ".//a[starts-with(@href, " \
"'/date/')]/text()",
'year': "./div/a[starts-with(@href, " \
'year': ".//a[starts-with(@href, " \
"'/search/name?birth_year=')]/text()"
},
postprocess=build_date),
Attribute(key='birth notes',
path="./div/a[starts-with(@href, " \
Attribute(key='birth place',
path=".//a[starts-with(@href, " \
"'/search/name?birth_place=')]/text()")]
_death_attrs = [Attribute(key='death date',
path={
'day': "./div/a[starts-with(@href, " \
'day': ".//a[starts-with(@href, " \
"'/date/')]/text()",
'year': "./div/a[starts-with(@href, " \
"'/search/name?death_date=')]/text()"
'year': ".//a[starts-with(@href, " \
"'/search/name?death_year=')]/text()"
},
postprocess=build_date),
Attribute(key='death notes',
path="./div/text()",
# TODO: check if this slicing is always correct
postprocess=lambda x: x.strip()[2:])]
Attribute(key='death place',
path=".//a[starts-with(@href, " \
"'/search/name?death_place=')]/text()")]
_film_attrs = [Attribute(key=None,
multi=True,
path={
'link': "./a[1]/@href",
'title': ".//text()",
'status': "./i/a//text()",
'roleID': "./div[@class='_imdbpyrole']/@roleid"
'link': "./b/a[1]/@href",
'title': "./b/a[1]/text()",
'notes': "./b/following-sibling::text()",
'year': "./span[@class='year_column']/text()",
'status': "./a[@class='in_production']/text()",
'rolesNoChar': './/br/following-sibling::text()',
'chrRoles': "./a[@imdbpyname]/@imdbpyname",
'roleID': "./a[starts-with(@href, '/character/')]/@href"
},
postprocess=lambda x:
build_movie(x.get('title') or u'',
year=x.get('year'),
movieID=analyze_imdbid(x.get('link') or u''),
roleID=(x.get('roleID') or u'').split('/'),
rolesNoChar=(x.get('rolesNoChar') or u'').strip(),
chrRoles=(x.get('chrRoles') or u'').strip(),
additionalNotes=x.get('notes'),
roleID=(x.get('roleID') or u''),
status=x.get('status') or None))]
extractors = [
Extractor(label='page title',
path="//title",
Extractor(label='name',
path="//h1[@class='header']",
attrs=Attribute(key='name',
path="./text()",
path=".//text()",
postprocess=lambda x: analyze_name(x,
canonical=1))),
canonical=1))),
Extractor(label='birth info',
path="//div[h5='Date of Birth:']",
path="//div[h4='Born:']",
attrs=_birth_attrs),
Extractor(label='death info',
path="//div[h5='Date of Death:']",
path="//div[h4='Died:']",
attrs=_death_attrs),
Extractor(label='headshot',
path="//a[@name='headshot']",
path="//td[@id='img_primary']/a",
attrs=Attribute(key='headshot',
path="./img/@src")),
Extractor(label='akas',
path="//div[h5='Alternate Names:']",
path="//div[h4='Alternate Names:']",
attrs=Attribute(key='akas',
path="./div/text()",
postprocess=lambda x: x.strip().split(' | '))),
path="./text()",
postprocess=lambda x: x.strip().split(' '))),
Extractor(label='filmography',
group="//div[@class='filmo'][h5]",
group_key="./h5/a[@name]/text()",
group_key_normalize=lambda x: x.lower()[:-1],
path="./ol/li",
attrs=_film_attrs)
group="//div[starts-with(@id, 'filmo-head-')]",
group_key="./a[@name]/text()",
group_key_normalize=lambda x: x.lower().replace(': ', ' '),
path="./following-sibling::div[1]" \
"/div[starts-with(@class, 'filmo-row')]",
attrs=_film_attrs),
Extractor(label='indevelopment',
path="//div[starts-with(@class,'devitem')]",
attrs=Attribute(key='in development',
multi=True,
path={
'link': './a/@href',
'title': './a/text()'
},
postprocess=lambda x:
build_movie(x.get('title') or u'',
movieID=analyze_imdbid(x.get('link') or u''),
roleID=(x.get('roleID') or u'').split('/'),
status=x.get('status') or None)))
]
preprocessors = [
# XXX: check that this doesn't cut "status" or other info...
(re.compile(r'<br>(\.\.\.| ?).+?</li>', re.I | re.M | re.S),
'</li>'),
(_reRoles, _manageRoles)]
preprocessors = [('<div class="clear"/> </div>', ''),
('<br/>', '<br />'),
(re.compile(r'(<a href="/character/ch[0-9]{7}")>(.*?)</a>'),
r'\1 imdbpyname="\2@@">\2</a>')]
def postprocess_data(self, data):
for what in 'birth date', 'death date':
if what in data and not data[what]:
del data[what]
# XXX: the code below is for backwards compatibility
# probably could be removed
for key in data.keys():
if key.startswith('actor '):
if not data.has_key('actor'):
data['actor'] = []
data['actor'].extend(data[key])
del data[key]
if key.startswith('actress '):
if not data.has_key('actress'):
data['actress'] = []
data['actress'].extend(data[key])
del data[key]
if key.startswith('self '):
if not data.has_key('self'):
data['self'] = []
data['self'].extend(data[key])
del data[key]
if key == 'birth place':
data['birth notes'] = data[key]
del data[key]
if key == 'death place':
data['death notes'] = data[key]
del data[key]
return data
@@ -181,6 +227,10 @@ class DOMHTMLBioParser(DOMParserBase):
# TODO: check if this slicing is always correct
postprocess=lambda x: u''.join(x).strip()[2:])]
extractors = [
Extractor(label='headshot',
path="//a[@name='headshot']",
attrs=Attribute(key='headshot',
path="./img/@src")),
Extractor(label='birth info',
path="//div[h5='Date of Birth']",
attrs=_birth_attrs),
......
@@ -262,14 +262,20 @@ def build_person(txt, personID=None, billingPos=None,
return person
_re_chrIDs = re.compile('[0-9]{7}')
_b_m_logger = logging.getLogger('imdbpy.parser.http.build_movie')
# To shrink spaces.
re_spaces = re.compile(r'\s+')
def build_movie(txt, movieID=None, roleID=None, status=None,
accessSystem='http', modFunct=None, _parsingCharacter=False,
_parsingCompany=False):
_parsingCompany=False, year=None, chrRoles=None,
rolesNoChar=None, additionalNotes=None):
"""Given a string as normally seen on the "categorized" page of
a person on the IMDb's web site, returns a Movie instance."""
# FIXME: Oook, lets face it: build_movie and build_person are now
# two horrible sets of patches to support the new IMDb design. They
# must be rewritten from scratch.
if _parsingCharacter:
_defSep = ' Played by '
elif _parsingCompany:
@@ -291,6 +297,8 @@ def build_movie(txt, movieID=None, roleID=None, status=None,
title = title[:-14] + ' (mini)'
# Try to understand where the movie title ends.
while True:
if year:
break
if title[-1:] != ')':
# Ignore the silly "TV Series" notice.
if title[-9:] == 'TV Series':
@@ -319,12 +327,24 @@ def build_movie(txt, movieID=None, roleID=None, status=None,
if notes: notes = '%s %s' % (title[nidx:], notes)
else: notes = title[nidx:]
title = title[:nidx].rstrip()
if year:
year = year.strip()
if title[-1] == ')':
fpIdx = title.rfind('(')
if fpIdx != -1:
if notes: notes = '%s %s' % (title[fpIdx:], notes)
else: notes = title[fpIdx:]
title = title[:fpIdx].rstrip()
title = u'%s (%s)' % (title, year)
if _parsingCharacter and roleID and not role:
roleID = None
if not roleID:
roleID = None
elif len(roleID) == 1:
roleID = roleID[0]
if not role and chrRoles and isinstance(roleID, (str, unicode)):
roleID = _re_chrIDs.findall(roleID)
role = ' / '.join(filter(None, chrRoles.split('@@')))
# Manages multiple roleIDs.
if isinstance(roleID, list):
tmprole = role.split('/')
@@ -355,13 +375,29 @@ def build_movie(txt, movieID=None, roleID=None, status=None,
movieID = str(movieID)
if (not title) or (movieID is None):
_b_m_logger.error('empty title or movieID for "%s"', txt)
if rolesNoChar:
rolesNoChar = filter(None, [x.strip() for x in rolesNoChar.split('/')])
if not role:
role = []
elif not isinstance(role, list):
role = [role]
role += rolesNoChar
notes = notes.strip()
if additionalNotes:
additionalNotes = re_spaces.sub(' ', additionalNotes).strip()
if notes:
notes += u' '
notes += additionalNotes
m = Movie(title=title, movieID=movieID, notes=notes, currentRole=role,
roleID=roleID, roleIsPerson=_parsingCharacter,
modFunct=modFunct, accessSystem=accessSystem)
if roleNotes and len(roleNotes) == len(roleID):
for idx, role in enumerate(m.currentRole):
if roleNotes[idx]:
role.notes = roleNotes[idx]
try:
if roleNotes[idx]:
role.notes = roleNotes[idx]
except IndexError:
break
# Status can't be checked here, and must be detected by the parser.
if status:
m['status'] = status
@@ -468,8 +504,10 @@ class DOMParserBase(object):
# converted to title=""Family Guy"" and this confuses BeautifulSoup.
if self.usingModule == 'beautifulsoup':
html_string = html_string.replace('""', '"')
#print html_string.encode('utf8')
if html_string:
dom = self.get_dom(html_string)
#print self.tostring(dom).encode('utf8')
try:
dom = self.preprocess_dom(dom)
except Exception, e:
......
@@ -52,6 +52,10 @@ re_imdbID = re.compile(r'(?<=nm|tt|ch)([0-9]{7})\b')
# movie AKAs.
re_makas = re.compile('(<p class="find-aka">.*?</p>)')
# Remove episode numbers.
re_filmo_episodes = re.compile('<div class="filmo-episodes">.*?</div>',
re.M | re.I)
def _unHtml(s):
"""Return a string without tags and no multiple spaces."""
@@ -537,24 +541,33 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
if _parseChr: w = 'characterID'
else: w = 'personID'
raise IMDbDataAccessError, 'unable to get %s "%s"' % (w, personID)
name = _unHtml(name[0])
name = _unHtml(name[0].replace(' - IMDb', ''))
if _parseChr:
name = name.replace('(Character)', '').strip()
name = name.replace('- Filmography by type', '').strip()
else:
name = name.replace('- Filmography by', '').strip()
r = analyze_name(name, canonical=not _parseChr)
for dKind in ('birth', 'death'):
date = _findBetween(s, '<h5>Date of %s:</h5>' % dKind.capitalize(),
('<a class', '</div>', '<br/><br/>'), maxRes=1)
for dKind in ('Born', 'Died'):
date = _findBetween(s, '%s:</h4>' % dKind.capitalize(),
('<div class', '</div>', '<br/><br/>'), maxRes=1)
if date:
date = _unHtml(date[0])
if date:
date, notes = date_and_notes(date)
#date, notes = date_and_notes(date)
# TODO: fix to handle real names.
date_notes = date.split(' in ', 1)
notes = u''
date = date_notes[0]
if len(date_notes) == 2:
notes = date_notes[1]
dtitle = 'birth'
if dKind == 'Died':
dtitle = 'death'
if date:
r['%s date' % dKind] = date
r['%s date' % dtitle] = date
if notes:
r['%s notes' % dKind] = notes
r['%s notes' % dtitle] = notes
akas = _findBetween(s, 'Alternate Names:</h5>', ('</div>',
'<br/><br/>'), maxRes=1)
if akas:
@@ -569,18 +582,13 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
hs[:] = _findBetween(hs[0], 'src="', '"', maxRes=1)
if hs: r['headshot'] = hs[0]
# Build a list of tuples such [('hrefLink', 'section name')]
workkind = _findBetween(s, '<div class="strip jump">', '</div>',
maxRes=1)
if workkind:
workkind[:] = _findBetween(workkind[0], 'href="#', '</a>')
else: