
Commit 9e0f4d0f authored by Ana Guerrero López

Import Upstream version 3.2

parent 3b788bce
......@@ -65,11 +65,19 @@ HELP = """imdbpy2sql.py usage:
IMDB_PTDF_DIR = None
# URI used to connect to the database.
URI = None
# Store custom queries specified on the command line.
CUSTOM_QUERIES = {}
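# (each key is one of the ALLOWED_TIMES below; each value is the list of
# raw query strings given for that time)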
# Allowed time specification, for custom queries.
ALLOWED_TIMES = ('BEGIN', 'BEFORE_DROP', 'BEFORE_CREATE', 'AFTER_CREATE',
'BEFORE_MOVIES', 'BEFORE_CAST', 'BEFORE_RESTORE',
'BEFORE_INDEXES', 'END')
# Manage arguments list.
try:
optlist, args = getopt.getopt(sys.argv[1:], 'u:d:h',
['uri=', 'data=', 'help'])
optlist, args = getopt.getopt(sys.argv[1:], 'u:d:e:h',
['uri=', 'data=', 'execute=',
'help'])
except getopt.error, e:
print 'Troubles with arguments.'
print HELP
......@@ -80,6 +88,15 @@ for opt in optlist:
IMDB_PTDF_DIR = opt[1]
elif opt[0] in ('-u', '--uri'):
URI = opt[1]
elif opt[0] in ('-e', '--execute'):
if opt[1].find(':') == -1:
print 'WARNING: wrong command syntax: "%s"' % opt[1]
continue
when, cmd = opt[1].split(':', 1)
if when not in ALLOWED_TIMES:
print 'WARNING: unknown time: "%s"' % when
continue
CUSTOM_QUERIES.setdefault(when, []).append(cmd)
elif opt[0] in ('-h', '--help'):
print HELP
sys.exit(0)
......@@ -826,7 +843,11 @@ def doCast(fp, roleid, rolename):
for item in ll[1:]:
if not item: continue
if item[0] == '[':
role = item[1:].rstrip(']')
# Quite inefficient, but there are some very strange
# cases of garbage in the plain text data files to handle...
role = item[1:]
if role[-1:] == ']':
role = role[:-1]
if role[-1:] == ')':
nidx = role.find('(')
if nidx != -1:
......@@ -1525,10 +1546,42 @@ def restoreImdbID(tons, cls):
print 'DONE! (restored %d entries out of %d)' % (count, len(tons))
def _executeQuery(query):
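"""Print and execute a single query on the global CURS cursor; failures are reported but don't stop the import."""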
print 'EXECUTING "%s"...' % (query),
sys.stdout.flush()
try:
CURS.execute(query)
print 'DONE!'
except Exception, e:
print 'FAILED (%s)!' % e
def executeCustomQueries(when):
"""Run custom queries as specified on the command line."""
for query in CUSTOM_QUERIES.get(when, []):
print 'EXECUTING "%s:%s"...' % (when, query)
sys.stdout.flush()
if query.startswith('FOR_EVERY_TABLE:'):
query = query[16:]
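# NOTE: 'SHOW TABLES' is MySQL syntax, so FOR_EVERY_TABLE assumes a MySQL-compatible server.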
CURS.execute('SHOW TABLES;')
tables = [x[0] for x in CURS.fetchall()]
for table in tables:
try:
_executeQuery(query % {'table': table})
t('%s command' % when)
except Exception, e:
print 'FAILED (%s)!' % e
else:
_executeQuery(query)
t('%s command' % when)
# begin the iterations...
def run():
print 'RUNNING imdbpy2sql.py'
executeCustomQueries('BEGIN')
# Storing imdbIDs for movies and persons.
try:
movies_imdbIDs = notNULLimdbID(Title)
......@@ -1541,12 +1594,16 @@ def run():
people_imdbIDs = []
print 'WARNING: failed to read imdbIDs for people'
executeCustomQueries('BEFORE_DROP')
# Truncate the current database.
print 'DROPPING current database...',
sys.stdout.flush()
dropTables()
print 'DONE!'
executeCustomQueries('BEFORE_CREATE')
# Rebuild the database structure.
print 'CREATING new tables...',
sys.stdout.flush()
......@@ -1554,9 +1611,13 @@ def run():
print 'DONE!'
t('dropping and recreating the database')
executeCustomQueries('AFTER_CREATE')
# Read the constants.
readConstants()
executeCustomQueries('BEFORE_CAST')
# Populate the CACHE_MID instance.
readMovieList()
# Comment readMovieList() and uncomment the following two lines
......@@ -1605,6 +1666,8 @@ def run():
completeCast()
t('completeCast()')
executeCustomQueries('BEFORE_RESTORE')
# Restoring imdbIDs for movies and persons.
try:
restoreImdbID(movies_imdbIDs, Title)
......@@ -1627,12 +1690,16 @@ def run():
print 'TOTAL TIME TO INSERT DATA: %d minutes, %d seconds' % \
divmod(int(time.time())-BEGIN_TIME, 60)
executeCustomQueries('BEFORE_INDEXES')
print 'building database indexes (this may take a while)'
sys.stdout.flush()
# Build database indexes.
createIndexes()
t('createIndexes()')
executeCustomQueries('END')
print 'DONE! (in %d minutes, %d seconds)' % \
divmod(int(time.time())-BEGIN_TIME, 60)
......
......@@ -15,6 +15,9 @@ I'd like to thank the following people for their help:
* Jesper Nøhr for a lot of testing, especially on the 'sql' data access system.
* Ken R. Garland for a bug report about 'cover url' and a lot of
other hints.
* Jon Sabo for a bug report about unicode and the imdbpy2sql.py script
and some feedback.
......
Changelog for IMDbPY
====================
* What's new in release 3.2 "Videodrome" (25 Sep 2007)
[global]
- now there's a unique place where "akas.imdb.com" is set: the
main module.
- introduced __version__ and VERSION in the main module.
- minor improvements to the documentation.
[http]
- updated the main movie parser to retrieve the recently modified
cast section.
- updated the crazy credits parser.
- fixed a bug retrieving 'cover url'.
[mobile]
- fixed a bug parsing people's filmography when only one duty
was listed.
- updated to retrieve series' creator.
[sql]
- added the ability to perform custom SQL queries at the command
line of the imdbpy2sql.py script.
- minor fixes for the imdbpy2sql.py script.
* What's new in release 3.1 "The Snake King" (18 Jul 2007)
[global]
- the IMDbPYweb account now returns a single item, when a search
......
......@@ -27,6 +27,11 @@ SQLObject home page: http://sqlobject.org/
SVN command to download the latest development version:
svn co http://svn.colorstudy.com/SQLObject/trunk SQLObject
Obviously SQLObject can access databases only through other
specific modules/packages, which you need to have installed (e.g.
python-mysqldb for MySQL, python-psycopg for PostgreSQL, python-sqlite
for SQLite, and so on).
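For instance, these are the kinds of URI you would pass to imdbpy2sql.py
(just a sketch: user, password, host, database name and path are
placeholders; check the SQLObject documentation for the exact syntax
of your backend):
  mysql://user:password@localhost/imdbpy       (needs python-mysqldb)
  postgres://user:password@localhost/imdbpy    (needs python-psycopg)
  sqlite:///full/path/to/imdbpy.db             (needs python-sqlite)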
SQL DATABASE INSTALLATION
=========================
......@@ -100,7 +105,7 @@ complete plain text data files set (as of 12 Nov 2006, with about
MySQL 5.0 InnoDB | ??? (80/???)
| maybe I haven't configured it properly: it
| looks like the creation of the indexes will
| take more than 2 or 3 hours.
| take more than 2 or 3 hours. But see NOTES below.
PostgreSQL 8.1 | 190 (177/13)
SQLite 3.2 | not tested: it seems way too slow: maybe 35 _hours_
| to complete; maybe I've misconfigured or I'm
......@@ -116,7 +121,19 @@ If you have different experiences, please tell me!
The imdbpy2sql.py script will print a lot of debug information on standard
output; you can save it to a file by appending (without quotes)
"2>&1 | tee output.txt" to the command.
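E.g. (only a sketch: adjust the data files directory and the URI to your setup):
  python imdbpy2sql.py -d /dir/with/plainTextDataFiles/ \
         -u mysql://user:password@localhost/imdbpy 2>&1 | tee output.txt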
[sqlite failure]
[MySQL InnoDB]
InnoDB is abysmally slow for our purposes: my suggestion is to always
use MyISAM tables and - if you really want to use InnoDB - convert
the tables later.
The imdbpy2sql.py script provides a simple way to manage this case;
see ADVANCED FEATURES below.
Anyway, if you really need to use InnoDB, I recommend setting
innodb_file_per_table to "true" in the server-side settings.
Beware that the conversion will be extremely slow (some hours), but
still faster than using InnoDB from the beginning.
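As a sketch, the server-side option would be set like this in my.cnf (the
exact location and name of the configuration file depend on your MySQL
installation):
  [mysqld]
  innodb_file_per_table = 1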
[SQLite failure]
It seems that, with older versions of the python-sqlite package, the first
run may fail; if you get a DatabaseError exception saying "no such table",
try running the command again with the same arguments.
......@@ -169,3 +186,38 @@ Now you can use IMDbPY with the database:
and so on...
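For reference, a minimal sketch of such a session (the URI is a placeholder;
use the same one you passed to imdbpy2sql.py):
  from imdb import IMDb
  i = IMDb('sql', uri='mysql://user:password@localhost/imdbpy')
  for movie in i.search_movie('the matrix'):
      print movie['title']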
ADVANCED FEATURES
=================
With the -e (or --execute) command line argument you can specify
custom queries to be executed at certain times, with the syntax:
-e "TIME:[OPTIONAL_MODIFIER:]QUERY"
Where TIME is actually one of these: 'BEGIN', 'BEFORE_DROP', 'BEFORE_CREATE',
'AFTER_CREATE', 'BEFORE_MOVIES', 'BEFORE_CAST', 'BEFORE_RESTORE',
'BEFORE_INDEXES' and 'END'.
The only available OPTIONAL_MODIFIER is 'FOR_EVERY_TABLE'; it means
that QUERY will be executed once for every table in the database (so
it doesn't make much sense to use it with the BEGIN, BEFORE_DROP or
BEFORE_CREATE times...), replacing the "%(table)s" placeholder in
QUERY with each table name.
You can specify as many -e arguments as you need, even if they
refer to the same TIME: they will be executed in order, from the first
to the last.
Also, always remember to correctly escape the queries: after all you're
passing them on the command line!
E.g. (ok, quite a silly example...):
-e "AFTER_CREATE:SELECT * FROM title;"
The most useful case is when you want to convert the tables of a MySQL
database from MyISAM to InnoDB:
-e "END:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=InnoDB;"
If your system uses InnoDB by default, you can trick it with:
-e "AFTER_CREATE:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=MyISAM;" -e "END:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=InnoDB;"
Cool, uh?
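To put everything together, a complete run could look something like this
(only a sketch: the directory, user, password and database name are
placeholders):
  python imdbpy2sql.py -d /dir/with/plainTextDataFiles/ \
         -u mysql://user:password@localhost/imdbpy \
         -e "AFTER_CREATE:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=MyISAM;" \
         -e "END:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=InnoDB;" \
         2>&1 | tee output.txt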
......@@ -24,6 +24,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
__all__ = ['IMDb', 'IMDbError', 'Movie', 'Person']
__version__ = VERSION = '3.2'
import sys
from types import UnicodeType, TupleType, ListType, MethodType
......@@ -33,9 +34,18 @@ from imdb._exceptions import IMDbError, IMDbDataAccessError
from imdb.utils import build_title, build_name
# URLs of the main pages for movies and persons.
imdbURL_movie_main = 'http://akas.imdb.com/title/tt%s/'
imdbURL_person_main = 'http://akas.imdb.com/name/nm%s/'
# URLs of the main pages for movies, persons and queries.
imdbURL_base = 'http://akas.imdb.com/'
# http://akas.imdb.com/title/
imdbURL_movie_base = '%stitle/' % imdbURL_base
# http://akas.imdb.com/title/tt%s/
imdbURL_movie_main = imdbURL_movie_base + 'tt%s/'
# http://akas.imdb.com/name/
imdbURL_person_base = '%sname/' % imdbURL_base
# http://akas.imdb.com/name/nm%s/
imdbURL_person_main = imdbURL_person_base + 'nm%s/'
# http://akas.imdb.com/find?%s
imdbURL_find = imdbURL_base + 'find?%s'
def IMDb(accessSystem='http', *arguments, **keywords):
......@@ -325,7 +335,7 @@ class IMDbBase:
def _searchIMDb(self, params):
"""Fetch the given search page from the IMDb akas server."""
from imdb.parser.http import IMDbURLopener
url = 'http://akas.imdb.com/find?%s' % params
url = imdbURL_find % params
content = u''
try:
urlOpener = IMDbURLopener()
......
......@@ -30,7 +30,7 @@ from types import UnicodeType, TupleType, ListType
# The modClearRefs can be used to strip names and titles references from
# the strings in Movie and Person objects.
from utils import modClearRefs, re_titleRef, re_nameRef
from imdb import IMDb
from imdb import IMDb, imdbURL_movie_base, imdbURL_person_base
from imdb.parser.http.utils import re_entcharrefssub, entcharrefs, \
entcharrefsget, subXMLRefs, subSGMLRefs
......@@ -92,8 +92,8 @@ def makeModCGILinks(movieTxt, personTxt, encoding='latin_1'):
return modCGILinks
# links to the imdb.com web site.
_movieTxt = '<a href="http://akas.imdb.com/title/tt%(movieID)s">%(title)s</a>'
_personTxt = '<a href="http://akas.imdb.com/name/nm%(personID)s">%(name)s</a>'
_movieTxt = '<a href="' + imdbURL_movie_base + 'tt%(movieID)s">%(title)s</a>'
_personTxt = '<a href="' + imdbURL_person_base + 'nm%(personID)s">%(name)s</a>'
modHtmlLinks = makeModCGILinks(movieTxt=_movieTxt, personTxt=_personTxt)
modHtmlLinksASCII = makeModCGILinks(movieTxt=_movieTxt, personTxt=_personTxt,
encoding='ascii')
......
......@@ -29,6 +29,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import re
from types import UnicodeType, StringType
from imdb import imdbURL_base
from imdb.Person import Person
from imdb.Movie import Movie
from imdb.utils import analyze_title, re_episodes
......@@ -218,6 +219,9 @@ class HTMLMovieParser(ParserBase):
# httpThin is used (they are the only place these info are).
self._in_tr = True
start_h3 = start_h5
end_h3 = end_h5
def start_h6(self, attrs):
# Production status is in h6 tags.
if self._in_production_notes:
......@@ -375,7 +379,7 @@ class HTMLMovieParser(ParserBase):
elif self._section == 'miscellaneous companies':
self._data.setdefault(self._section,
[]).append(ct.replace(' ', '::', 1))
else:
elif self._section != 'cast':
self._data.setdefault(self._section, []).append(ct)
self._cur_txt = u''
......@@ -437,7 +441,8 @@ class HTMLMovieParser(ParserBase):
self._in_total_episodes = True
elif href and href.startswith('/chart/top?tt'):
self._in_top250 = True
elif href and href.endswith('photogallery') and \
elif href and (href.endswith('photogallery') or \
href.endswith('posters')) and \
self.get_attr_value(attrs, 'name') == 'poster':
self._in_poster = True
# From here on, we're inside some kind of information and a href.
......@@ -1072,40 +1077,39 @@ class HTMLCrazyCreditsParser(ParserBase):
def _reset(self):
"""Reset the parser."""
self._in_cc2 = 0
self._cc = []
self._in_cc = False
self._ccc = u''
self._nrbr = 0
def get_data(self):
"""Return the dictionary."""
if not self._cc: return {}
return {'crazy credits': self._cc}
def start_pre(self, attrs):
def start_ul(self, attrs):
if self._in_content:
self._in_cc2 = 1
self._in_cc = True
def end_pre(self):
if self._in_cc2:
self.app()
self._in_cc2 = 0
def end_ul(self):
self._in_cc = False
def do_br(self, attrs):
if not self._in_cc2: return
self._nrbr += 1
if self._nrbr == 2:
self.app()
if not self._in_cc: return
if self._ccc: self._ccc += u' '
def start_li(self, attrs):
self._ccc = u''
def app(self):
def end_li(self):
if not self._in_cc: return
self._ccc = self._ccc.strip()
if self._in_cc2 and self._ccc:
self._cc.append(self._ccc.replace('\n', ' '))
if self._ccc:
self._ccc = self._ccc.replace('\n', ' ').replace(' ', ' ')
self._cc.append(self._ccc)
self._ccc = u''
self._nrbr = 0
def _handle_data(self, data):
if self._in_cc2:
if self._in_cc:
self._ccc += data
......@@ -1203,6 +1207,12 @@ class HTMLQuotesParser(ParserBase):
def end_a(self): pass
def start_h3(self, attrs):
self._in_quo2 = 0
self._cquo = u''
def end_h3(self): pass
def do_hr(self, attrs):
if self._in_content and self._in_quo2 and self._cquo:
self._cquo = self._cquo.strip()
......@@ -1636,7 +1646,7 @@ class HTMLOfficialsitesParser(ParserBase):
if href:
if not href.lower().startswith('http://'):
if href.startswith('/'): href = href[1:]
href = 'http://akas.imdb.com/%s' % href
href = '%s%s' % (imdbURL_base, href)
self._cosl = href
def end_a(self): pass
......@@ -1987,7 +1997,8 @@ class HTMLDvdParser(ParserBase):
self._seencover = 1
src = self.get_attr_value(attrs, 'src')
if src and src.find('noposter') == -1:
if src[0] == '/': src = 'http://akas.imdb.com%s' % src
if src[0] == '/':
src = '%s%s' % (imdbURL_base, src[1:])
self._cdvd['cover'] = src
def start_p(self, attrs):
......@@ -2207,7 +2218,7 @@ class HTMLNewsParser(ParserBase):
if href:
if not href.startswith('http://'):
if href[0] == '/': href = href[1:]
href = 'http://akas.imdb.com/%s' % href
href = '%s%s' % (imdbURL_base, href)
self._cur_news['link'] = href
def _handle_data(self, data):
......@@ -2276,7 +2287,7 @@ class HTMLAmazonReviewsParser(ParserBase):
if href:
if not href.startswith('http://'):
if href[0] == '/': href = href[1:]
href = 'http://akas.imdb.com/%s' % href
href = '%s%s' % (imdbURL_base, href)
self._cur_link = href.strip()
def end_a(self): pass
......@@ -2477,7 +2488,8 @@ class HTMLSalesParser(ParserBase):
if href:
if self._get_img or self._get_link:
if href[0] == '/':
href = 'http://akas.imdb.com%s' % href
href = href[1:]
href = '%s%s' % (imdbURL_base, href)
self._cur_info['link'] = href
self._get_link = 0
......
......@@ -157,6 +157,8 @@ def build_person(txt, personID=None, billingPos=None, accessSystem='http'):
role_comment = role_comment[:-4].rstrip()
elif role_comment[-2:] == ' &':
role_comment = role_comment[:-2].rstrip()
elif role_comment[-6:] == '& ....':
role_comment = role_comment[:-6].rstrip()
# Get the notes.
cmt_idx = role_comment.find('(')
if cmt_idx != -1:
......
......@@ -27,13 +27,13 @@ import re
from urllib import unquote
from types import ListType, TupleType
from imdb import imdbURL_movie_main, imdbURL_person_main
from imdb.Movie import Movie
from imdb.Person import Person
from imdb.utils import analyze_title, analyze_name, canonicalName, \
re_episodes, date_and_notes
from imdb._exceptions import IMDbDataAccessError
from imdb.parser.http import IMDbHTTPAccessSystem, \
imdbURL_movie, imdbURL_person
from imdb.parser.http import IMDbHTTPAccessSystem
from imdb.parser.http.utils import subXMLRefs, subSGMLRefs, build_person, \
build_movie, re_spaces
......@@ -188,7 +188,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
return res
def get_movie_main(self, movieID):
cont = self._mretrieve(imdbURL_movie % movieID + 'maindetails')
cont = self._mretrieve(imdbURL_movie_main % movieID + 'maindetails')
title = _findBetween(cont, '<title>', '</title>', maxRes=1)
if not title:
raise IMDbDataAccessError, 'unable to get movieID "%s"' % movieID
......@@ -254,6 +254,12 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
'</div>',
'<br/> <br/>'),
maxRes=1)
if not creator:
# They change 'Created by' to 'Creator' and vice versa
# from time to time...
creator = _findBetween(cont, 'Creator:</h5>',
('class="tn15more"', '</div>',
'<br/> <br/>'), maxRes=1)
if creator:
creator = creator[0]
if creator.find('tn15more'): creator = '%s>' % creator
......@@ -392,7 +398,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
return {'data': d}
def get_movie_plot(self, movieID):
cont = self._mretrieve(imdbURL_movie % movieID + 'plotsummary')
cont = self._mretrieve(imdbURL_movie_main % movieID + 'plotsummary')
plot = _findBetween(cont, '<p class="plotpar">', '</p>')
plot[:] = [_unHtml(x) for x in plot]
for i in xrange(len(plot)):
......@@ -445,7 +451,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
return res
def get_person_main(self, personID):
s = self._mretrieve(imdbURL_person % personID + 'maindetails')
s = self._mretrieve(imdbURL_person_main % personID + 'maindetails')
r = {}
name = _findBetween(s, '<title>', '</title>', maxRes=1)
if not name:
......@@ -477,6 +483,11 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
maxRes=1)
if workkind:
workkind[:] = _findBetween(workkind[0], 'href="#', '</a>')
else:
# Assume there's only one section and/or there are no
# section links, for some reason.
workkind[:] = _findBetween(s, '<h5><a name=', '</a></h5>')
workkind[:] = [x.lstrip('"').rstrip(':').lower() for x in workkind]
ws = []
for work in workkind:
wsplit = work.split('">', 1)
......@@ -531,7 +542,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
return {'data': r, 'info sets': ('main', 'filmography')}
def get_person_biography(self, personID):
cont = self._mretrieve(imdbURL_person % personID + 'bio')
cont = self._mretrieve(imdbURL_person_main % personID + 'bio')
d = {}
spouses = _findBetween(cont, 'Spouse</h5>', ('</table>', '</dd>'),
maxRes=1)
......
......@@ -34,7 +34,7 @@ DO_SCRIPTS = 1
# version of the software; CVS releases contain a string
# like ".cvsYearMonthDay(OptionalChar)".
version = '3.1'
version = '3.2'
home_page = 'http://imdbpy.sf.net/'
......