Commit 4e9cf685 authored by Ana Guerrero López

Import Upstream version 4.3

parent ffb60035
......@@ -242,7 +242,7 @@ if ('--mysql-force-myisam' in sys.argv[1:] and
if CSV_DIR:
if URIlower.startswith('mysql'):
CSV_LOAD_SQL = CSV_MYSQL
elif URIlower.startswith('postrges'):
elif URIlower.startswith('postgres'):
CSV_LOAD_SQL = CSV_PGSQL
elif URIlower.startswith('ibm'):
CSV_LOAD_SQL = CSV_DB2
......@@ -394,7 +394,7 @@ class CSVCursor(object):
tableToAddID = False
if tName in ('cast_info', 'movie_info', 'person_info',
'movie_companies', 'movie_link', 'aka_name',
'complete_cast'):
'complete_cast', 'movie_info_idx'):
tableToAddID = tName
if tName not in self._counters:
self._counters[tName] = 1
......
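The hunk above adds 'movie_info_idx' to the tables that get a generated primary key while writing CSV files. A minimal sketch of the per-table counter idea, with illustrative names (not the script's actual helpers):

    _counters = {}

    def next_row_id(table_name):
        # Tables without a natural primary key get sequential, per-table IDs.
        if table_name not in _counters:
            _counters[table_name] = 1
        row_id = _counters[table_name]
        _counters[table_name] += 1
        return row_id

    # next_row_id('movie_info_idx') -> 1, then 2, 3, ...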
......@@ -10,6 +10,10 @@ I'd like to thank the following people for their help:
* H. Turgut Uyar for a number of bug reports and a lot of work on
the test-suite.
* Adeodato Simó for a bug report about the new imdb.com layout.
* Josh Harding for a bug report about the new imdb.com layout.
* Xavier Naidoo for a bug report about top250 and BeautifulSoup.
* Basil Shubin for hints about a new helper function.
......
Changelog for IMDbPY
====================
* What's new in release 4.3 "Public Enemies" (18 Nov 2009)
[general]
- the installer now takes care of .mo files.
- introduced, in the helpers module, the functions keyToXML and
translateKey, useful to translate dictionary keys.
- support for smart guessing of the language of a movie title.
- updated the DTD.
[http]
- fixed a lot of bugs introduced by the new IMDb.com design.
- nicer handling of HTTP 404 response code.
- fixed parsers for top250 and bottom100 lists.
- fixed a bug parsing AKAs.
- fixed misc bugs.
[mobile]
- removed duplicates in list of genres.
[sql]
- fixed a bug in the imdbpy2sql.py script using CSV files;
the 'movie_info_idx' and 'movie_keyword' tables were left
empty or filled with wrong data.
* What's new in release 4.2 "Battlestar Galactica" (31 Aug 2009)
[general]
- the 'local' data access system is gone. See README.local.
......
......@@ -18,6 +18,8 @@ classes, as if they are dictionaries.
E.g.: you can translate "long-imdb-name" - the tag returned by
the call person.getAsXML('long imdb name') - but not "long imdb name"
directly.
To translate keys, you can use the translateKey function in
the 'helpers' module.
USAGE
......
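A quick, hedged illustration of the two helper functions documented above (assuming a working IMDbPY install; without a compiled .mo catalog, translateKey simply returns the XML-style tag unchanged):

    from imdb.helpers import keyToXML, translateKey

    print keyToXML('long imdb name')     # -> 'long-imdb-name'
    print translateKey('long imdb name') # gettext lookup of 'long-imdb-name'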
<!--
XML Document Type Definition for IMDbPY 4.2.
XML Document Type Definition for IMDbPY 4.3.
http://imdbpy.sf.net/dtd/imdbpy42.dtd
http://imdbpy.sf.net/dtd/imdbpy43.dtd
Copyright 2009 H. Turgut Uyar <uyar@tekir.org>
2009 Davide Alberani <da@erlug.linux.it>
-->
......@@ -129,6 +130,10 @@
| series-writers
| series-years
| set-decoration
| smart-canonical-episode-title
| smart-canonical-series-title
| smart-canonical-title
| smart-long-imdb-canonical-title
| sound-clips
| sound-crew
| sound-mix
......@@ -695,6 +700,10 @@
<!ELEMENT rating (#PCDATA)>
<!ELEMENT series-title (#PCDATA)>
<!ELEMENT series-years (#PCDATA)>
<!ELEMENT smart-canonical-episode-title (#PCDATA)>
<!ELEMENT smart-canonical-series-title (#PCDATA)>
<!ELEMENT smart-canonical-title (#PCDATA)>
<!ELEMENT smart-long-imdb-canonical-title (#PCDATA)>
<!ELEMENT synopsis (#PCDATA)>
<!ELEMENT title (#PCDATA)>
<!ELEMENT top-250-rank (#PCDATA)>
......
......@@ -23,6 +23,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
from copy import deepcopy
from imdb import articles
from imdb.utils import analyze_title, build_title, canonicalTitle, \
flatten, _Container, cmpMovies
......@@ -93,6 +94,8 @@ class Movie(_Container):
'aka': 'akas',
'also known as': 'akas',
'country': 'countries',
'production country': 'countries',
'production countries': 'countries',
'genre': 'genres',
'runtime': 'runtimes',
'lang': 'languages',
......@@ -180,13 +183,40 @@ class Movie(_Container):
addkeys = []
if self.data.has_key('title'):
addkeys += ['canonical title', 'long imdb title',
'long imdb canonical title']
'long imdb canonical title',
'smart canonical title',
'smart long imdb canonical title']
if self.data.has_key('episode of'):
addkeys += ['long imdb episode title', 'series title',
'canonical series title', 'episode title',
'canonical episode title']
'canonical episode title',
'smart canonical series title',
'smart canonical episode title']
return addkeys
def guessLanguage(self):
"""Guess the language of the title of this movie; returns None
if there are no hints."""
lang = self.get('languages')
if lang:
lang = lang[0]
else:
country = self.get('countries')
if country:
lang = articles.COUNTRY_LANG.get(country[0])
return lang
def smartCanonicalTitle(self, title=None, lang=None):
"""Return the canonical title, guessing its language.
The title can be forced with the 'title' argument (internally
used) and the language can be forced with the 'lang' argument,
otherwise it's auto-detected."""
if title is None:
title = self.data.get('title', u'')
if lang is None:
lang = self.guessLanguage()
return canonicalTitle(title, lang=lang)
def _getitem(self, key):
"""Handle special keys."""
if self.data.has_key('episode of'):
......@@ -197,10 +227,15 @@ class Movie(_Container):
elif key == 'canonical series title':
ser_title = self.data['episode of']['title']
return canonicalTitle(ser_title)
elif key == 'smart canonical series title':
ser_title = self.data['episode of']['title']
return self.smartCanonicalTitle(ser_title)
elif key == 'episode title':
return self.data.get('title', u'')
elif key == 'canonical episode title':
return canonicalTitle(self.data.get('title', u''))
elif key == 'smart canonical episode title':
return self.smartCanonicalTitle(self.data.get('title', u''))
if self.data.has_key('title'):
if key == 'title':
return self.data['title']
......@@ -208,8 +243,13 @@ class Movie(_Container):
return build_title(self.data)
elif key == 'canonical title':
return canonicalTitle(self.data['title'])
elif key == 'smart canonical title':
return self.smartCanonicalTitle(self.data['title'])
elif key == 'long imdb canonical title':
return build_title(self.data, canonical=1)
elif key == 'smart long imdb canonical title':
return build_title(self.data, canonical=1,
lang=self.guessLanguage())
return None
def getID(self):
......
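A hedged usage sketch of the new smart title keys (the Movie constructor kwargs follow the usual IMDbPY conventions; the movieID and data are made up):

    from imdb.Movie import Movie

    movie = Movie(movieID='0000001',
                  data={'title': u'Die Hard', 'countries': [u'USA']})
    print movie.guessLanguage()           # -> 'English', via articles.COUNTRY_LANG
    print movie['canonical title']        # -> u'Hard, Die': 'die' is an article
                                          #    in the generic, language-blind list
    print movie['smart canonical title']  # -> u'Die Hard': 'die' is not an
                                          #    English article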
......@@ -25,7 +25,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
__all__ = ['IMDb', 'IMDbError', 'Movie', 'Person', 'Character', 'Company',
'available_access_systems']
__version__ = VERSION = '4.2'
__version__ = VERSION = '4.3svn20091117'
# Import compatibility module (importing it is enough).
import _compat
......
"""
articles module (imdb package).
This module provides functions and data to smartly handle articles
(in various languages) at the beginning of movie titles.
Copyright 2009 Davide Alberani <da@erlug.linux.it>
2009 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
# List of generic articles in many languages.
# XXX: Managing titles in a lot of different languages, a function to recognize
# an initial article can't be perfect; sometimes we'll stumble upon a short
# word that is an article in some language, but it's not in another; in these
# situations we have to choose if we want to interpret this little word
# as an article or not (remember that we don't know what the original language
# of the title was).
# Example: 'da' is an article in (I think) Dutch and it's used as an article
# even in some American slang. Unfortunately it's also a preposition in
# Italian, and it's widely used in Mandarin (for whatever it means!).
# Running a script over the whole list of titles (and aliases), I've found
# that 'da' is used as an article only 23 times, and as another thing 298
# times, so I've decided to _always_ consider 'da' as a non article.
#
# Here is a list of words that are _never_ considered as articles, complete
# with the count of times they are used in one way or another:
# 'en' (376 vs 594), 'to' (399 vs 727), 'as' (198 vs 276), 'et' (79 vs 99),
# 'des' (75 vs 150), 'al' (78 vs 304), 'ye' (14 vs 70),
# 'da' (23 vs 298), "'n" (8 vs 12)
#
# I've left in the list 'i' (1939 vs 2151) and 'uno' (52 vs 56)
# I'm not sure what '-al' is, and so I've left it out...
#
# Generic list of articles in utf-8 encoding:
GENERIC_ARTICLES = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
"l'", 'il', 'das', 'les', 'i', 'o', 'ein', 'un', 'de', 'los',
'an', 'una', 'las', 'eine', 'den', 'het', 'gli', 'lo', 'os',
'ang', 'oi', 'az', 'een', 'ha-', 'det', 'ta', 'al-',
'mga', "un'", 'uno', 'ett', 'dem', 'egy', 'els', 'eines',
'\xc3\x8f', '\xc3\x87', '\xc3\x94\xc3\xaf', '\xc3\x8f\xc3\xa9')
# Lists of articles separated by language. If possible, the list should
# be sorted by frequency (not very important, but...)
# If you want to add a list of articles for another language, mail it
# to imdbpy-devel@lists.sourceforge.net; non-ascii articles must be utf-8
# encoded.
LANG_ARTICLES = {
'English': ('the', 'a', 'an'),
'Italian': ('la', 'le', "l'", 'il', 'i', 'un', 'una', 'gli', 'lo', "un'",
'uno'),
'Spanish': ('la', 'le', 'el', 'les', 'un', 'los', 'una', 'uno', 'unos',
'unas'),
'Portuguese': ('a', 'as', 'o', 'os', 'um', 'uns', 'uma', 'umas'),
'Turkish': (), # Some languages don't have articles.
}
LANG_ARTICLESget = LANG_ARTICLES.get
# Maps a language to countries where it is the main language.
# If you want to add an entry for another language or country, mail it
# to imdbpy-devel@lists.sourceforge.net.
_LANG_COUNTRIES = {
'English': ('USA', 'UK', 'Canada', 'Ireland', 'Australia'),
'Italian': ('Italy',),
'Spanish': ('Spain', 'Mexico'),
'Portuguese': ('Portugal', 'Brazil'),
'Turkish': ('Turkey',),
#'German': ('Germany', 'East Germany', 'West Germany'),
#'French': ('France'),
}
# Maps countries to their main language.
COUNTRY_LANG = {}
for lang in _LANG_COUNTRIES:
for country in _LANG_COUNTRIES[lang]:
COUNTRY_LANG[country] = lang
def toUnicode(articles):
"""Convert a list of articles utf-8 encoded to unicode strings."""
return tuple([art.decode('utf_8') for art in articles])
def toDicts(articles):
"""Given a list of utf-8 encoded articles, build two dictionary (one
utf-8 encoded and another one with unicode keys) for faster matches."""
uArticles = toUnicode(articles)
return dict([(x, x) for x in articles]), dict([(x, x) for x in uArticles])
def addTrailingSpace(articles):
"""From the given list of utf-8 encoded articles, return two
lists (one utf-8 encoded and another one in unicode) where a space
is added at the end - if the last char is not ' or -."""
_spArticles = []
_spUnicodeArticles = []
for article in articles:
if article[-1] not in ("'", '-'):
article += ' '
_spArticles.append(article)
_spUnicodeArticles.append(article.decode('utf_8'))
return _spArticles, _spUnicodeArticles
# Caches.
_ART_CACHE = {}
_SP_ART_CACHE = {}
def articlesDictsForLang(lang):
"""Return dictionaries of articles specific for the given language, or the
default one if the language is not known."""
if lang in _ART_CACHE:
return _ART_CACHE[lang]
artDicts = toDicts(LANG_ARTICLESget(lang, GENERIC_ARTICLES))
_ART_CACHE[lang] = artDicts
return artDicts
def spArticlesForLang(lang):
"""Return lists of articles (plus optional spaces) specific for the
given language, or the default one if the language is not known."""
if lang in _SP_ART_CACHE:
return _SP_ART_CACHE[lang]
spArticles = addTrailingSpace(LANG_ARTICLESget(lang, GENERIC_ARTICLES))
_SP_ART_CACHE[lang] = spArticles
return spArticles
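A short sketch of how the per-language helpers above behave (names are all defined in this module; 'Klingon' stands in for any unknown language):

    from imdb import articles

    assert articles.COUNTRY_LANG['Italy'] == 'Italian'

    # articlesDictsForLang returns (utf-8 dict, unicode dict) and caches them.
    utf8_arts, unicode_arts = articles.articlesDictsForLang('English')
    assert 'the' in utf8_arts and 'il' not in utf8_arts

    # Unknown languages fall back to GENERIC_ARTICLES.
    utf8_generic = articles.articlesDictsForLang('Klingon')[0]
    assert 'il' in utf8_generic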
......@@ -25,12 +25,17 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import re
from cgi import escape
import gettext
from gettext import gettext as _
gettext.textdomain('imdbpy')
# The modClearRefs can be used to strip names and titles references from
# the strings in Movie and Person objects.
from imdb.utils import modClearRefs, re_titleRef, re_nameRef, re_characterRef
from imdb.utils import modClearRefs, re_titleRef, re_nameRef, \
re_characterRef, _tagAttr
from imdb import IMDb, imdbURL_movie_base, imdbURL_person_base, \
imdbURL_character_base
import imdb.locale
from imdb.Movie import Movie
from imdb.Person import Person
from imdb.Character import Character
......@@ -349,3 +354,13 @@ def fullSizeCoverURL(obj):
return _re_clearURL.sub('', coverUrl)
def keyToXML(key):
"""Return a key (the ones used to access information in Movie and
other classes instances) converted to the style of the XML output."""
return _tagAttr(key, '')[0]
def translateKey(key):
"""Translate a given key."""
return _(keyToXML(key))
......@@ -99,6 +99,18 @@ _cookie_id = 'rH1jNAkjTlNXvHolvBVBsgaPICNZbNdjVjzFwzas9JRmusdjVoqBs/Hs12NR+1WFxE
_cookie_uu = 'su4/m8cho4c6HP+W1qgq6wchOmhnF0w+lIWvHjRUPJ6nRA9sccEafjGADJ6hQGrMd4GKqLcz2X4z5+w+M4OIKnRn7FpENH7dxDQu3bQEHyx0ZEyeRFTPHfQEX03XF+yeN1dsPpcXaqjUZAw+lGRfXRQEfz3RIX9IgVEffdBAHw2wQXyf9xdMPrQELw0QNB8dsffsqcdQemjPB0w+moLcPh0JrKrHJ9hjBzdMPpcXTH7XRwwOk='
class _FakeURLOpener(object):
"""Fake URLOpener object, used to return empty strings instead of
errors.
"""
def __init__(self, url, headers):
self.url = url
self.headers = headers
def read(self, *args, **kwds): return ''
def close(self, *args, **kwds): pass
def info(self, *args, **kwds): return self.headers
class IMDbURLopener(FancyURLopener):
"""Fetch web pages and handle errors."""
def __init__(self, *args, **kwargs):
......@@ -190,9 +202,12 @@ class IMDbURLopener(FancyURLopener):
# The detection of the encoding is error prone...
warnings.warn('Unable to detect the encoding of the retrieved '
'page [%s]; falling back to default latin1.' % encode)
##print unicode(content, encode, 'replace').encode('utf8')
return unicode(content, encode, 'replace')
def http_error_default(self, url, fp, errcode, errmsg, headers):
if errcode == 404:
return _FakeURLOpener(url, headers)
raise IMDbDataAccessError, {'url': 'http:%s' % url,
'errcode': errcode,
'errmsg': errmsg,
......
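With the change above, a 404 no longer raises IMDbDataAccessError: http_error_default hands back a _FakeURLOpener whose read() returns an empty string. A hedged sketch of the effect (made-up URL; assuming the method shown above returning unicode(content, ...) is the opener's retrieve_unicode):

    opener = IMDbURLopener()
    content = opener.retrieve_unicode('http://www.imdb.com/title/tt9999999/')
    # The caller sees an empty unicode page instead of an exception.
    assert content == u''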
......@@ -77,7 +77,7 @@ class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
Extractor(label='akas',
path="//div[h5='Alternate Names:']",
attrs=Attribute(key='akas',
path="./text()",
path="./p//text()",
postprocess=lambda x: x.strip().split(' / '))),
Extractor(label='filmography',
......
......@@ -209,20 +209,20 @@ class DOMHTMLMovieParser(DOMParserBase):
attrs=[
Attribute(key="plot summary",
path="./h5[starts-with(text(), " \
"'Plot:')]/../text()",
"'Plot:')]/../p/text()",
postprocess=lambda x: \
x.strip().rstrip('|').rstrip()),
Attribute(key="aspect ratio",
path="./h5[starts-with(text()," \
" 'Aspect')]/../text()",
" 'Aspect')]/../p/text()",
postprocess=lambda x: x.strip()),
Attribute(key="mpaa",
path="./h5/a[starts-with(text()," \
" 'MPAA')]/../../text()",
" 'MPAA')]/../../p/text()",
postprocess=lambda x: x.strip()),
Attribute(key="countries",
path="./h5[starts-with(text(), " \
"'Countr')]/../a/text()",
"'Countr')]/..//a/text()",
postprocess=makeSplitter(sep='\n')),
Attribute(key="language",
path="./h5[starts-with(text(), " \
......@@ -239,11 +239,11 @@ class DOMHTMLMovieParser(DOMParserBase):
# Collects akas not enclosed in <i> tags.
Attribute(key='other akas',
path="./h5[starts-with(text(), " \
"'Also Known As')]/../text()",
"'Also Known As')]/../p/text()",
postprocess=makeSplitter(sep='::')),
Attribute(key='runtimes',
path="./h5[starts-with(text(), " \
"'Runtime')]/../text()",
"'Runtime')]/../p/text()",
postprocess=makeSplitter()),
Attribute(key='certificates',
path="./h5[starts-with(text(), " \
......@@ -251,21 +251,21 @@ class DOMHTMLMovieParser(DOMParserBase):
postprocess=makeSplitter('Certification:')),
Attribute(key='number of seasons',
path="./h5[starts-with(text(), " \
"'Seasons')]/../text()",
"'Seasons')]/..//text()",
postprocess=lambda x: x.count('|') + 1),
Attribute(key='original air date',
path="./h5[starts-with(text(), " \
"'Original Air Date')]/../text()"),
"'Original Air Date')]/../p/text()"),
Attribute(key='tv series link',
path="./h5[starts-with(text(), " \
"'TV Series')]/../a/@href"),
"'TV Series')]/..//a/@href"),
Attribute(key='tv series title',
path="./h5[starts-with(text(), " \
"'TV Series')]/../a/text()")
"'TV Series')]/..//a/text()")
]),
Extractor(label='creator',
path="//h5[starts-with(text(), 'Creator')]/../a",
path="//h5[starts-with(text(), 'Creator')]/..//a",
attrs=Attribute(key='creator', multi=True,
path={'name': "./text()",
'link': "./@href"},
......@@ -275,7 +275,7 @@ class DOMHTMLMovieParser(DOMParserBase):
)),
Extractor(label='thin writer',
path="//h5[starts-with(text(), 'Writer')]/../a",
path="//h5[starts-with(text(), 'Writer')]/..//a",
attrs=Attribute(key='thin writer', multi=True,
path={'name': "./text()",
'link': "./@href"},
......@@ -285,7 +285,7 @@ class DOMHTMLMovieParser(DOMParserBase):
)),
Extractor(label='thin director',
path="//h5[starts-with(text(), 'Director')]/../a",
path="//h5[starts-with(text(), 'Director')]/..//a",
attrs=Attribute(key='thin director', multi=True,
path={'name': "./text()",
'link': "@href"},
......@@ -401,7 +401,10 @@ class DOMHTMLMovieParser(DOMParserBase):
obj.accessSystem = self._as
obj.modFunct = self._modFunct
if 'akas' in data or 'other akas' in data:
data['akas'] = data.get('other akas', []) + data.get('akas', [])
akas = data.get('akas')
if not akas:
akas = []
data['akas'] = data.get('other akas', []) + akas
if 'other akas' in data:
del data['other akas']
if 'runtimes' in data:
......@@ -1334,7 +1337,7 @@ class DOMHTMLNewsParser(DOMParserBase):
'date': x.get('fromdate').split('|')[0].strip(),
'from': x.get('fromdate').split('|')[1].replace('From ',
'').strip(),
'body': x.get('body').strip(),
'body': (x.get('body') or u'').strip(),
'link': _normalize_href(x.get('link')),
'full article link': _normalize_href(x.get('fulllink'))
}))
......
......@@ -63,24 +63,24 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
_birth_attrs = [Attribute(key='birth date',
path={
'day': "./a[starts-with(@href, " \
'day': "./p/a[starts-with(@href, " \
"'/OnThisDay?')]/text()",
'year': "./a[starts-with(@href, " \
'year': "./p/a[starts-with(@href, " \
"'/BornInYear?')]/text()"
},
postprocess=build_date),
Attribute(key='birth notes',
path="./a[starts-with(@href, '/BornWhere?')]/text()")]
path="./p/a[starts-with(@href, '/BornWhere?')]/text()")]
_death_attrs = [Attribute(key='death date',
path={
'day': "./a[starts-with(@href, " \
'day': "./p/a[starts-with(@href, " \
"'/OnThisDay?')]/text()",
'year': "./a[starts-with(@href, " \
'year': "./p/a[starts-with(@href, " \
"'/DiedInYear?')]/text()"
},
postprocess=build_date),
Attribute(key='death notes',
path="./text()",
path="./p/text()",
# TODO: check if this slicing is always correct
postprocess=lambda x: x.strip()[2:])]
_film_attrs = [Attribute(key=None,
......@@ -121,7 +121,7 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
Extractor(label='akas',
path="//div[h5='Alternate Names:']",
attrs=Attribute(key='akas',
path="./text()",
path="./p/text()",
postprocess=lambda x: x.strip().split(' | '))),
Extractor(label='filmography',
......
......@@ -39,17 +39,11 @@ class DOMHTMLTop250Parser(DOMParserBase):
result = tparser.parse(top250_html_string)
"""
label = 'top 250'
h1text = 'Top 250'
ranktext = 'top 250 rank'
def _init(self):
# XXX: for some reason /..//tr[@valign] returns an empty list
# with bsoup. On the other hand, using /../..//tr[@valign]
# returns every item TWO times (see the work-around in
# postprocess_data). Very odd.
self.extractors = [Extractor(label=self.label,
path="//table//h1[starts-with(text(), '" + \
self.h1text + "')]/../..//tr[@valign]",
path="//div[@id='main']//table//tr",
attrs=Attribute(key=None,
multi=True,
path={self.ranktext: "./td[1]//text()",
......@@ -65,6 +59,7 @@ class DOMHTMLTop250Parser(DOMParserBase):
mlist = []
data = data[self.label]
# Avoid duplicates. A real fix, using XPath, would be desirable.
# XXX: probably this is no longer needed.
seenIDs = []
for d in data:
if 'movieID' not in d: continue
......@@ -101,7 +96,6 @@ class DOMHTMLBottom100Parser(DOMHTMLTop250Parser):
result = tparser.parse(bottom100_html_string)
"""
label = 'bottom 100'
h1text = 'Bottom 100'
ranktext = 'bottom 100 rank'
......
......@@ -258,7 +258,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
air_date = air_date[0]
vi = air_date.find('(')
if vi != -1:
date = air_date[:vi].strip()
date = _unHtml(air_date[:vi]).strip()
if date != '????':
d['original air date'] = date
air_date = air_date[vi:]
......@@ -321,7 +321,8 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
cvurl = _findBetween(cvurl[0], 'src="', '"', maxRes=1)
if cvurl: d['cover url'] = cvurl[0]
genres = _findBetween(cont, 'href="/Sections/Genres/', '/')
if genres: d['genres'] = genres
if genres:
d['genres'] = list(set(genres))
ur = _findBetween(cont, '<div class="meta">', '</div>', maxRes=1)
if ur:
rat = _findBetween(ur[0], '<b>', '</b>', maxRes=1)
......@@ -392,7 +393,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
runtimes = runtimes[0]
runtimes = [x.strip().replace(' min', '').replace(' (', '::(', 1)
for x in runtimes.split('|')]
d['runtimes'] = runtimes
d['runtimes'] = [_unHtml(x).strip() for x in runtimes]
if kind == 'episode':
# number of episodes.
epsn = _findBetween(cont, 'title="Full Episode List">', '</a>',
......@@ -437,13 +438,13 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
if plotoutline:
plotoutline = plotoutline[0].strip()
plotoutline = plotoutline.rstrip('|').rstrip()
if plotoutline: d['plot outline'] = plotoutline
if plotoutline: d['plot outline'] = _unHtml(plotoutline)
aratio = _findBetween(cont, 'Aspect Ratio:</h5>', ['<a ', '</div>'],
maxRes=1)
if aratio:
aratio = aratio[0].strip().replace(' (', '::(', 1)
if aratio:
d['aspect ratio'] = aratio
d['aspect ratio'] = _unHtml(aratio)
return {'data': d}
def get_movie_plot(self, movieID):
......
......@@ -28,6 +28,7 @@ from copy import copy, deepcopy
from time import strptime, strftime
from imdb import VERSION
from imdb import articles
from imdb._exceptions import IMDbParserError
# The regular expression for the "long" year format of IMDb, like
......@@ -168,61 +169,20 @@ def build_name(name_dict, canonical=None):
return name
# List of articles.
# XXX: Managing titles in a lot of different languages, a function to recognize
# an initial article can't be perfect; sometimes we'll stumble upon a short
# word that is an article in some language, but it's not in another; in these
# situations we have to choose if we want to interpret this little word
# as an article or not (remember that we don't know what the original language
# of the title was).
# Example: 'da' is an article in (I think) Dutch and it's used as an article
# even in some American slang. Unfortunately it's also a preposition in
# Italian, and it's widely used in Mandarin (for whatever it means!).
# Running a script over the whole list of titles (and aliases), I've found
# that 'da' is used as an article only 23 times, and as another thing 298
# times, so I've decided to _always_ consider 'da' as a non article.
#
# Here is a list of words that are _never_ considered as articles, complete
# with the count of times they are used in one way or another:
# 'en' (376 vs 594), 'to' (399 vs 727), 'as' (198 vs 276), 'et' (79 vs 99),
# 'des' (75 vs 150), 'al' (78 vs 304), 'ye' (14 vs 70),
# 'da' (23 vs 298), "'n" (8 vs 12)
#
# I've left in the list 'i' (1939 vs 2151) and 'uno' (52 vs 56)
# I'm not sure what '-al' is, and so I've left it out...
#
# List of articles in utf-8 encoding:
_articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
"l'", 'il', 'das', 'les', 'i', 'o', 'ein', 'un', 'de', 'los',
'an', 'una', 'las', 'eine', 'den', 'het', 'gli', 'lo', 'os',
'ang', 'oi', 'az', 'een', 'ha-', 'det', 'ta', 'al-',
'mga', "un'", 'uno', 'ett', 'dem', 'egy', 'els', 'eines',
'\xc3\x8f', '\xc3\x87', '\xc3\x94\xc3\xaf', '\xc3\x8f\xc3\xa9')
_unicodeArticles = tuple([art.decode('utf_8') for art in _articles])
# Articles in a dictionary.
_articlesDict = dict([(x, x) for x in _articles])
# Unicode version.
_unicodeArticlesDict = dict([(x, x) for x in _unicodeArticles])
_spArticles = []
# Variations with a trailing space.
for article in _articles:
if article[-1] not in ("'", '-'): article += ' '
_spArticles.append(article)
_spUnicodeArticles = []
for article in _unicodeArticles:
if article[-1] not in ("'", '-'): article += u' '
_spUnicodeArticles.append(article)
articlesDicts = (_articlesDict, _unicodeArticlesDict)
spArticles = (_spArticles, _spUnicodeArticles)
def canonicalTitle(title):
# XXX: here only for backward compatibility. Find and remove any dependency.
_articles = articles.GENERIC_ARTICLES
_unicodeArticles = articles.toUnicode(_articles)
articlesDicts = articles.articlesDictsForLang(None)
spArticles = articles.spArticlesForLang(None)
def canonicalTitle(title, lang=None):
"""Return the title in the canonic format 'Movie Title, The';
beware that it doesn't handle long imdb titles, but only the
title portion, without year[/imdbIndex] or special markup."""
title portion, without year[/imdbIndex] or special markup.
The 'lang' argument can be used to specify the language of the title.
"""
isUnicode = isinstance(title, unicode)
articlesDicts = articles.articlesDictsForLang(lang)
try:
if title.split(', ')[-1].lower() in articlesDicts[isUnicode]:
return title
......@@ -233,11 +193,13 @@ def canonicalTitle(title):
else:
_format = '%s, %s'
ltitle = title.lower()
spArticles = articles.spArticlesForLang(lang)
for article in spArticles[isUnicode]:
if ltitle.startswith(article):
lart = len(article)
title = _format % (title[lart:], title[:lart])
if article[-1] == ' ': title = title[:-1]
if article[-1] == ' ':
title = title[:-1]
break
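A brief sketch of the new optional 'lang' argument (titles are illustrative):

    from imdb.utils import canonicalTitle

    print canonicalTitle(u'Das Boot')
    # -> u'Boot, Das': 'das' is in the generic article list
    print canonicalTitle(u'Das Boot', lang='English')
    # -> u'Das Boot': 'das' is not an English article
    print canonicalTitle(u'La dolce vita', lang='Italian')
    # -> u'dolce vita, La'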