Commit a0580248 authored by Ana Guerrero López

Import Upstream version 2.8

parent 613203f0
......@@ -40,7 +40,7 @@ from imdb._exceptions import IMDbParserError
re_nameImdbIndex = re.compile(r'\(([IVXLCDM]+)\)')
HELP = """imdbpy2sql usage:
HELP = """imdbpy2sql.py usage:
%s -d /directory/with/PlainTextDataFiles/ -u URI
# NOTE: URI is something along the line:
......@@ -401,7 +401,7 @@ class _BaseCache(dict):
self._tmpDict = {}
self._flushing = 0
self._deferredData = {}
self._recursionLevel = 1
self._recursionLevel = 0
if d is not None:
for k, v in d.iteritems(): self[k] = v
......@@ -417,14 +417,13 @@ class _BaseCache(dict):
else:
self._deferredData[key] = self.counter.next()
def flush(self, quiet=0, _resetRecursion=1):
def flush(self, quiet=0, _recursionLevel=0):
"""Flush to the database."""
if self._flushing: return
self._flushing = 1
if _resetRecursion: self._recursionLevel = 1
if self._recursionLevel >= 5:
if _recursionLevel >= 5:
print 'WARNING recursion level exceeded trying to flush data'
print 'WARNING this batch of data is lost.'
print 'WARNING this batch of data is lost (%s).' % self.className
self._tmpDict.clear()
return
if self._tmpDict:
......@@ -433,36 +432,28 @@ class _BaseCache(dict):
self._tmpDict.clear()
except OperationalError, e:
# Dataset too large; split it in two and retry.
print ' * TOO MANY DATA (%s items), SPLITTING (run #%d)...' % \
(len(self._tmpDict), self._recursionLevel)
self._recursionLevel += 1
c1 = self.__class__()
c2 = self.__class__()
newflushEvery = self.flushEvery / 2
c1.flushEvery = newflushEvery
c1._recursionLevel = self._recursionLevel
c2.flushEvery = newflushEvery
c2._recursionLevel = self._recursionLevel
if self.className == 'MoviesCache':
c1.episodesYear = c2.episodesYear = self.episodesYear
elif self.className == 'AkasMoviesCache':
c1.notes = c2.notes = self.notes
c1.ids = c2.ids = self.ids
## new code!
## the same class instance (self) is used, instead of
## creating two separated objects.
_recursionLevel += 1
self._flushing = 0
firstHalf = {}
poptmpd = self._tmpDict.popitem
for x in xrange(len(self._tmpDict)/2):
originalLength = len(self._tmpDict)
for x in xrange(1 + originalLength/2):
k, v = poptmpd()
c1._tmpDict[k] = v
c2._tmpDict = self._tmpDict
c1.flush(quiet=quiet, _resetRecursion=0)
c1._tmpDict.clear()
if len(c1) > 0:
self.update(c1)
del c1
c2.flush(quiet=quiet, _resetRecursion=0)
c2._tmpDict.clear()
if len(c2) > 0:
self.update(c2)
del c2
firstHalf[k] = v
print ' * TOO MANY DATA (%s items in %s), recursion: %s' % \
(originalLength,
self.className,
_recursionLevel)
print ' * SPLITTING (run 1 of 2), recursion: %s' % \
_recursionLevel
self.flush(quiet=quiet, _recursionLevel=_recursionLevel)
self._tmpDict = firstHalf
print ' * SPLITTING (run 2 of 2), recursion: %s' % \
_recursionLevel
self.flush(quiet=quiet, _recursionLevel=_recursionLevel)
self._tmpDict.clear()
self._flushing = 0
# Flush also deferred data.
......
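For reference, a self-contained sketch of the split-and-retry strategy the new flush() above adopts: on failure the batch is halved and each half is flushed again on the same object, giving up after a few recursion levels. BatchCache, _store, MAX_BATCH and MAX_RECURSION are illustrative names only (not part of IMDbPY), and the _flushing re-entrancy guard and the per-class bookkeeping are omitted.

MAX_RECURSION = 5
MAX_BATCH = 1000   # pretend the database refuses bigger batches.

class BatchCache(dict):
    def __init__(self):
        dict.__init__(self)
        self._tmpDict = {}

    def _store(self, items):
        # Stand-in for the real INSERT statements.
        if len(items) > MAX_BATCH:
            raise ValueError('batch too large')

    def flush(self, _recursionLevel=0):
        if _recursionLevel >= MAX_RECURSION:
            # Too many splits: give up and drop this batch.
            self._tmpDict.clear()
            return
        try:
            self._store(self._tmpDict)
            self._tmpDict.clear()
        except ValueError:
            # Split the batch in two and retry on this same object,
            # instead of building two new cache instances.
            _recursionLevel += 1
            firstHalf = {}
            for _ in xrange(1 + len(self._tmpDict) / 2):
                k, v = self._tmpDict.popitem()
                firstHalf[k] = v
            self.flush(_recursionLevel)      # flush the remaining half.
            self._tmpDict = firstHalf
            self.flush(_recursionLevel)      # then the popped half.
            self._tmpDict.clear()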
......@@ -9,6 +9,14 @@ I'd like to thank the following people for their help:
* Ana Guerrero, for the official debian package.
* Jay Klein for a bug report and testing to fix a nasty bug in the
imdbpy2sql.py script (splitting too large data sets).
* Ivan Garcia for an important bug report about the use of IMDbPY
within wxPython programs.
* Kessia Pinheiro for a bug report about tv series list of episodes.
* Michael G. Noll for a bug report and a patch to fix a bug
retrieving 'plot keywords'.
......
Changelog for IMDbPY
====================
* What's new in release 2.8 "Apollo 13" (14 Dec 2006)
[general]
- fix for environments where sys.stdin was overridden by a custom object.
[http data access system]
- added support for the movies' "FAQ" page.
- now the "full credits" (aka "full cast and crew") page can be parsed;
it's mostly useful for tv series, because this page is complete while
"combined details" contains only partial data.
E.g.
ia.update(tvSeries, 'full credits')
- added support for the movies' "on television" (ia.update(movie, "airing"))
- fixed a bug with 'miscellaneous companies'.
- fixed a bug retrieving the list of episodes for tv series.
- fixed a bug with tv series episodes' cast.
- generic fix for XML single tags (invalid HTML tags) like <br/>
- fixed a minor bug with 'original air date'.
[sql data access system]
- fix for a unicode bug with recent versions of SQLObject and MySQL.
- fix for a nasty bug in imdbpy2sql.py that will show up splitting a
data set too large to be sent in a single shot to the database.
[mobile data access system]
- fixed a bug searching titles and names, where XML char references
were not converted.
* What's new in release 2.7 "Pitch Black" (26 Sep 2006)
[general]
- fixed search_movie.py and search_person.py scripts; now they return
......
......@@ -94,6 +94,28 @@ Summary of keys of the Movie object for a series:
'episodes': dictionary (seasons) of dictionary (episodes in the season).
FULL CREDITS
============
Retrieving credits for a tv series, you may notice that many long lists
(like "cast", "writers", ...) are incomplete.
You can fetch the complete list of cast and crew with the "full credits"
data set; e.g.:
from imdb import IMDb
i = IMDb()
m = i.get_movie('0285331') # 24.
print len(m['cast']) # wooah! Only 7 people in the cast of 24?!?!
i.update(m, 'full credits')
print len(m['cast']) # yup! More than 300 people!
RATINGS
=======
You can retrieve rating information about every episode in a tv series
using the 'episodes rating' data set.
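A minimal sketch, along the lines of the FULL CREDITS example above (the
structure of the data stored under the 'episodes rating' key may vary, so
the loop below simply prints whatever it finds):
from imdb import IMDb
i = IMDb()
m = i.get_movie('0285331') # 24.
i.update(m, 'episodes rating')
# Dump every entry of the ratings data set, whatever its layout.
for entry in m.get('episodes rating', []):
    print entry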
PEOPLE
======
......
NOTE: this release (2.6) contains huge performance improvements;
while still using the SQLObject ORM, the imdbpy2sql.py script can take
just a couple of hours to complete, instead of the 6 or more hours of
the previous version.
If you want to help, please subscribe to the imdbpy-devel mailing list at:
NOTE: the imdbpy2sql.py script, used to populate a database using
the data in the IMDb's plain text data files, is a critical piece
of IMDbPY: it's based on the SQLObject ORM to be database-independent
and contains a lot of tricks to be as fast as possible; however there
are huge margins for improvement; if you want to help, please read the
TODO.txt file and subscribe to the imdbpy-devel mailing list at:
http://imdbpy.sf.net/?page=help#ml
......@@ -90,15 +90,17 @@ database requires a call to the execute() method for every single row
of data, and they will be much slower - from 2 to 7 times slower than
MySQL.
I've done some tests, using an AMD Athlon 1800+, 512MB of RAM:
I've done some tests, using an AMD Athlon 1800+, 512MB of RAM, over a
complete plain text data files set (as of 12 Nov 2006, with about
890.000 titles and over 2.000.000 names):
database | time in minutes: total (insert data/create indexes)
----------------------+-----------------------------------------------------
MySQL 5.0 MyISAM | 95 (75/20)
MySQL 5.0 MyISAM | 115 (95/20)
MYSQL 5.0 InnoDB | ??? (80/???)
| maybe I've not configured it properly: it
| looks like the creation of the indexes will
| more then 2 or 3 hours.
| take more than 2 or 3 hours.
PostgreSQL 8.1 | 190 (177/13)
SQLite 3.2 | not tested: it seems way too slow: maybe 35 _hours_
| to complete; maybe I've misconfigured or I'm
......@@ -107,12 +109,21 @@ I've done some tests, using an AMD Athlon 1800+, 512MB of RAM:
If you have different experiences, please tell me!
NOTE
====
NOTES
=====
[save the output]
The imdbpy2sql.py script will print a lot of debug information on the
standard output; you can save it to a file by appending (without quotes)
"2>&1 | tee output.txt" to the command line.
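For example (the directory and URI below are placeholders, as in the
usage string shown earlier):
python imdbpy2sql.py -d /directory/with/PlainTextDataFiles/ -u URI 2>&1 | tee output.txt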
[adult titles]
Beware that, while running, the imdbpy2sql.py script will output a lot
of strings containing both person names and movie titles. The script
has absolutely no way to know that the processed title is an adult-only
movie, so... if you leave it running and your little daughter runs to you
screaming 'daddy! daddy! what kind of animals trains Rocco in the
documentary "Rocco: Animal Trainer 17"???'... well it's not my fault! ;-)
SQLITE NOTE
===========
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
......@@ -85,8 +85,8 @@ class Movie(_Container):
'miscellaneouscrew': 'miscellaneous crew',
'crewmembers': 'miscellaneous crew',
'crew members': 'miscellaneous crew',
'misc companies': 'other companies',
'miscellaneous companies': 'other companies',
'other companies': 'miscellaneous companies',
'misc companies': 'miscellaneous companies',
'aka': 'akas',
'also known as': 'akas',
'country': 'countries',
......@@ -114,12 +114,14 @@ class Movie(_Container):
'amazon review': 'amazon reviews',
'merchandising': 'merchandising links',
'merchandise': 'merchandising links',
'sales': 'merchandising links'}
'sales': 'merchandising links',
'faq': 'faqs',
'frequently asked questions': 'faqs'}
keys_tomodify_list = ('plot', 'trivia', 'alternate versions', 'goofs',
'quotes', 'dvd', 'laserdisc', 'news', 'soundtrack',
'crazy credits', 'business',
'supplements', 'video review')
'crazy credits', 'business', 'supplements',
'video review', 'faqs')
cmpFunct = cmpMovies
......
......@@ -74,7 +74,7 @@ def IMDb(accessSystem='http', *arguments, **keywords):
# objects can use this default encoding, returning strings.
# Anyway, passing unicode strings to search_movie() and search_person()
# methods is always safer.
encoding = sys.stdin.encoding or sys.getdefaultencoding()
encoding = getattr(sys.stdin, 'encoding', '') or sys.getdefaultencoding()
class IMDbBase:
"""The base class used to search for a movie/person and to get a
......
......@@ -147,7 +147,7 @@ def sortedEpisodes(m, season=None):
return episodes
# Idea an portions of the code courtesy of none none (dclist at gmail.com)
# Idea and portions of the code courtesy of none none (dclist at gmail.com)
_re_imdbIDurl = re.compile(r'\b(nm|tt)([0-9]{7})\b')
def get_byURL(url, info=None, args=None, kwds=None):
"""Return a Movie or Person object for the given URL; info is the
......
......@@ -43,7 +43,7 @@ from movieParser import movie_parser, plot_parser, movie_awards_parser, \
soundclips_parser, videoclips_parser, news_parser, \
photosites_parser, amazonrev_parser, guests_parser, \
business_parser, sales_parser, episodes_parser, \
eprating_parser
eprating_parser, movie_faqs_parser, airing_parser
from searchMovieParser import search_movie_parser
from personParser import maindetails_parser, bio_parser, \
otherworks_parser, person_awards_parser, \
......@@ -268,6 +268,10 @@ class IMDbHTTPAccessSystem(IMDbBase):
cont = self._retrieve(imdbURL_movie % movieID + 'maindetails')
return movie_parser.parse(cont)
def get_movie_full_credits(self, movieID):
cont = self._retrieve(imdbURL_movie % movieID + 'fullcredits')
return movie_parser.parse(cont)
def get_movie_plot(self, movieID):
cont = self._retrieve(imdbURL_movie % movieID + 'plotsummary')
return plot_parser.parse(cont)
......@@ -413,6 +417,14 @@ class IMDbHTTPAccessSystem(IMDbBase):
episode['episode of'].movieID = movieID
return data_d
def get_movie_faqs(self, movieID):
cont = self._retrieve(imdbURL_movie % movieID + 'faq')
return movie_faqs_parser.parse(cont)
def get_movie_airing(self, movieID):
cont = self._retrieve(imdbURL_movie % movieID + 'tvschedule')
return airing_parser.parse(cont)
def _search_person(self, name, results):
# The URL of the query.
# XXX: To retrieve the complete results list:
......
......@@ -96,6 +96,10 @@ sgmlentity.update(dict([('#34', u'"'), ('#38', u'&'),
re_sgmlref = re.compile('&(%s);' % '|'.join(map(re.escape, sgmlentity)))
re_sgmlrefsub = re_sgmlref.sub
# Matches XML-only single tags, like <br/> ; they are invalid in HTML,
# but sometimes they can be found.
re_xmltags = re.compile('<([a-zA-Z]+)/>')
def _replXMLRef(match):
"""Replace the matched XML/HTML entities and references;
......@@ -170,8 +174,8 @@ class ParserBase(SGMLParser):
self._titlesRefs = {}
self._titleRefCID = ''
self._nameRefCID = ''
self._titleCN = ''
self._nameCN = ''
self._titleCN = u''
self._nameCN = u''
self._inTTRef = 0
self._inLinkTTRef = 0
self._inNMRef = 0
......@@ -223,7 +227,7 @@ class ParserBase(SGMLParser):
except IMDbParserError:
pass
self._titleRefCID = ''
self._titleCN = ''
self._titleCN = u''
self._inTTRef = 0
self._inLinkTTRef = 0
elif self._nameRefCID and self._nameCN:
......@@ -239,7 +243,7 @@ class ParserBase(SGMLParser):
except IMDbParserError:
pass
self._nameRefCID = ''
self._nameCN = ''
self._nameCN = u''
self._inNMRef = 0
def _refs_anchor_bgn(self, attrs):
......@@ -304,6 +308,8 @@ class ParserBase(SGMLParser):
if not isinstance(html_string, UnicodeType):
html_string = unicode(html_string, 'latin_1', 'replace')
html_string = subXMLRefs(html_string)
# Fix invalid HTML single tags like <br/>
html_string = re_xmltags.sub('<\\1 />', html_string)
self.feed(html_string)
if self.getRefs and self._inTTRef: self._add_ref('tt')
data = self.get_data()
......
......@@ -160,7 +160,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
##params = urllib.urlencode({'tt': 'on','mx': str(results),'q': title})
#params = 'q=%s&tt=on&mx=%s' % (urllib.quote_plus(title), str(results))
#cont = self._mretrieve(imdbURL_search % params)
cont = self._get_search_content('tt', title, results)
cont = subXMLRefs(self._get_search_content('tt', title, results))
title = _findBetween(cont, '<title>', '</title>')
res = []
if not title: return res
......@@ -373,7 +373,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
##params = urllib.urlencode({'nm': 'on', 'mx': str(results), 'q': name})
#params = 'q=%s&nm=on&mx=%s' % (urllib.quote_plus(name), str(results))
#cont = self._mretrieve(imdbURL_search % params)
cont = self._get_search_content('nm', name, results)
cont = subXMLRefs(self._get_search_content('nm', name, results))
name = _findBetween(cont, '<title>', '</title>')
res = []
if not name: return res
......
......@@ -177,6 +177,13 @@ class IMDbSqlAccessSystem(IMDbLocalAndSqlAccessSystem):
self._moviesubs.update(_busd)
self.do_adult_search(adultSearch)
def _buildNULLCondition(self, col, val):
"""Build a comparison for columns where values can be NULL."""
if val is None:
return ISNULL(col)
else:
return col == val
def _getTitleID(self, title):
"""Given a long imdb canonical title, returns a movieID or
None if not found."""
......@@ -186,22 +193,28 @@ class IMDbSqlAccessSystem(IMDbLocalAndSqlAccessSystem):
epof = td['episode of']
seriesID = [s.id for s in Title.select(
AND(Title.q.title == epof['title'].encode('utf_8'),
Title.q.imdbIndex == epof.get('imdbIndex'),
self._buildNULLCondition(Title.q.imdbIndex,
epof.get('imdbIndex')),
Title.q.kindID == self._kindRev[epof['kind']],
Title.q.productionYear == epof.get('year')))]
if seriesID:
condition = AND(IN(Title.q.episodeOfID, seriesID),
Title.q.title == td['title'].encode('utf_8'),
Title.q.imdbIndex == td.get('imdbIndex'),
self._buildNULLCondition(Title.q.imdbIndex,
td.get('imdbIndex')),
Title.q.kindID == self._kindRev[td['kind']],
Title.q.productionYear == td.get('year'))
if condition is None:
condition = AND(Title.q.title == td['title'].encode('utf_8'),
Title.q.imdbIndex == td.get('imdbIndex'),
self._buildNULLCondition(Title.q.imdbIndex,
td.get('imdbIndex')),
Title.q.kindID == self._kindRev[td['kind']],
Title.q.productionYear == td.get('year'))
res = Title.select(condition)
if res.count() != 1:
try:
if res.count() != 1:
return None
except UnicodeDecodeError:
return None
return res[0].id
......@@ -210,8 +223,13 @@ class IMDbSqlAccessSystem(IMDbLocalAndSqlAccessSystem):
None if not found."""
nd = analyze_name(name)
res = Name.select(AND(Name.q.name == nd['name'].encode('utf_8'),
Name.q.imdbIndex == str(nd.get('imdbIndex'))))
if res.count() != 1:
self._buildNULLCondition(Name.q.imdbIndex,
nd.get('imdbIndex'))))
try:
c = res.count()
if res.count() != 1:
return None
except UnicodeDecodeError, e:
return None
return res[0].id
......
......@@ -141,7 +141,11 @@ DB_TABLES = [Name, KindType, Title, AkaName, AkaTitle, RoleType, CastInfo,
def setConnection(uri, debug=False):
"""Set connection for every table."""
conn = connectionForURI(uri)
kw = {}
if uri.lower().startswith('mysql'):
kw['use_unicode'] = 1
kw['sqlobject_encoding'] = 'utf8'
conn = connectionForURI(uri, **kw)
conn.debug = debug
for table in DB_TABLES:
table.setConnection(conn)
......
......@@ -839,6 +839,9 @@ class _Container:
return 0
return 1
# XXX: really useful???
# consider also that this will confuse people who meant to
# call ia.update(movieObject, 'data set') instead.
def update(self, dict):
self.data.update(dict)
......
......@@ -34,7 +34,7 @@ DO_SCRIPTS = 1
# version of the software; CVS releases contain a string
# like ".cvsYearMonthDay(OptionalChar)".
version = '2.7'
version = '2.8'
home_page = 'http://imdbpy.sf.net/'
......