Commit 8cf69e1c authored by Ana Guerrero López's avatar Ana Guerrero López

Import Upstream version 3.5

parent 6895fa63
...@@ -5,7 +5,7 @@ characters4local.py script. ...@@ -5,7 +5,7 @@ characters4local.py script.
This script creates some files to manage characters' information This script creates some files to manage characters' information
for the 'local' data access system. for the 'local' data access system.
Copyright 2007 Davide Alberani <da@erlug.linux.it> Copyright 2007-2008 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
...@@ -92,7 +92,13 @@ def doCast(dataF, roleCount=0): ...@@ -92,7 +92,13 @@ def doCast(dataF, roleCount=0):
if i < noWith: if i < noWith:
# Eat 'attributeID'. # Eat 'attributeID'.
fread(3) fread(3)
length = ord(fread(1)) try:
length = ord(fread(1))
except TypeError:
# Prevent the strange case where fread(1) returns '';
# it should not happen; maybe there's some garbage in
# the files...
length = 0
if length > 0: if length > 0:
curRole = fread(length) curRole = fread(length)
noterixd = curRole.rfind('(') noterixd = curRole.rfind('(')
...@@ -123,7 +129,7 @@ def doCast(dataF, roleCount=0): ...@@ -123,7 +129,7 @@ def doCast(dataF, roleCount=0):
def writeData(d, directory): def writeData(d, directory):
"""Write d data into file in the specified directory.""" """Write d data into files in the specified directory."""
# Open files. # Open files.
print 'Start writing data to directory %s.' % directory print 'Start writing data to directory %s.' % directory
char2id = anydbm.open(os.path.join(directory, 'character2id.index'), 'n') char2id = anydbm.open(os.path.join(directory, 'character2id.index'), 'n')
......
...@@ -5,7 +5,7 @@ imdbpy2sql.py script. ...@@ -5,7 +5,7 @@ imdbpy2sql.py script.
This script puts the data of the plain text data files into a This script puts the data of the plain text data files into a
SQL database. SQL database.
Copyright 2005-2007 Davide Alberani <da@erlug.linux.it> Copyright 2005-2008 Davide Alberani <da@erlug.linux.it>
2006 Giuseppe "Cowo" Corbelli <cowo --> lugbs.linux.it> 2006 Giuseppe "Cowo" Corbelli <cowo --> lugbs.linux.it>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
...@@ -61,6 +61,7 @@ HELP = """imdbpy2sql.py usage: ...@@ -61,6 +61,7 @@ HELP = """imdbpy2sql.py usage:
# NOTE: --COMPATIBILITY-OPTIONS can be one of: # NOTE: --COMPATIBILITY-OPTIONS can be one of:
--mysql-innodb insert data into a MySQL MyISAM db, --mysql-innodb insert data into a MySQL MyISAM db,
and then convert it to InnoDB. and then convert it to InnoDB.
--mysql-force-myisam force the creation of MyISAM tables.
--ms-sqlserver compatibility mode for Microsoft SQL Server --ms-sqlserver compatibility mode for Microsoft SQL Server
and SQL Express. and SQL Express.
--sqlite-transactions uses transactions, to speed-up SQLite. --sqlite-transactions uses transactions, to speed-up SQLite.
...@@ -73,6 +74,8 @@ HELP = """imdbpy2sql.py usage: ...@@ -73,6 +74,8 @@ HELP = """imdbpy2sql.py usage:
IMDB_PTDF_DIR = None IMDB_PTDF_DIR = None
# URI used to connect to the database. # URI used to connect to the database.
URI = None URI = None
# Max allowed recursion, inserting data.
MAX_RECURSION = 10
# Store custom queries specified on the command line. # Store custom queries specified on the command line.
CUSTOM_QUERIES = {} CUSTOM_QUERIES = {}
# Allowed time specification, for custom queries. # Allowed time specification, for custom queries.
...@@ -82,20 +85,27 @@ ALLOWED_TIMES = ('BEGIN', 'BEFORE_DROP', 'BEFORE_CREATE', 'AFTER_CREATE', ...@@ -82,20 +85,27 @@ ALLOWED_TIMES = ('BEGIN', 'BEFORE_DROP', 'BEFORE_CREATE', 'AFTER_CREATE',
'AFTER_MOVIES_TODB', 'BEFORE_PERSONS_TODB', 'AFTER_MOVIES_TODB', 'BEFORE_PERSONS_TODB',
'AFTER_PERSONS_TODB','BEFORE_SQLDATA_TODB', 'AFTER_PERSONS_TODB','BEFORE_SQLDATA_TODB',
'AFTER_SQLDATA_TODB', 'BEFORE_AKAMOVIES_TODB', 'AFTER_SQLDATA_TODB', 'BEFORE_AKAMOVIES_TODB',
'AFTER_AKAMOVIES_TODB', 'BEFORE_EVERY_TODB', 'AFTER_AKAMOVIES_TODB', 'BEFORE_CHARACTERS_TODB',
'AFTER_CHARACTERS_TODB', 'BEFORE_EVERY_TODB',
'AFTER_EVERY_TODB') 'AFTER_EVERY_TODB')
# Shortcuts for some compatibility options. # Shortcuts for some compatibility options.
MYSQLFORCEMYISAM_OPTS = ['-e',
'AFTER_CREATE:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=MyISAM;']
MYSQLINNODB_OPTS = ['-e', MYSQLINNODB_OPTS = ['-e',
'AFTER_CREATE:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=MyISAM;', 'AFTER_CREATE:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=MyISAM;',
'-e', 'END:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=InnoDB;'] '-e', 'END:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=InnoDB;']
SQLSERVER_OPTS = ['-e', 'BEFORE_EVERY_TODB:SET IDENTITY_INSERT %(table)s ON;', SQLSERVER_OPTS = ['-e', 'BEFORE_EVERY_TODB:SET IDENTITY_INSERT %(table)s ON;',
'-e', 'AFTER_EVERY_TODB:SET IDENTITY_INSERT %(table)s OFF;'] '-e', 'AFTER_EVERY_TODB:SET IDENTITY_INSERT %(table)s OFF;']
SQLITE_OPTS = ['-e', 'BEFORE_EVERY_TODB:BEGIN TRANSACTION;', SQLITE_OPTS = ['-e', 'BEFORE_EVERY_TODB:BEGIN TRANSACTION;',
'-e', 'AFTER_EVERY_TODB:COMMIT;'] '-e', 'AFTER_EVERY_TODB:COMMIT;',
'-e', 'BEFORE_INDEXES:BEGIN TRANSACTION;',
'e', 'END:COMMIT;']
if '--mysql-innodb' in sys.argv[1:]: if '--mysql-innodb' in sys.argv[1:]:
sys.argv += MYSQLINNODB_OPTS sys.argv += MYSQLINNODB_OPTS
if '--mysql-force-myisam' in sys.argv[1:]:
sys.argv += MYSQLFORCEMYISAM_OPTS
if '--ms-sqlserver' in sys.argv[1:]: if '--ms-sqlserver' in sys.argv[1:]:
sys.argv += SQLSERVER_OPTS sys.argv += SQLSERVER_OPTS
if '--sqlite-transactions' in sys.argv[1:]: if '--sqlite-transactions' in sys.argv[1:]:
...@@ -107,6 +117,7 @@ try: ...@@ -107,6 +117,7 @@ try:
['uri=', 'data=', 'execute=', ['uri=', 'data=', 'execute=',
'mysql-innodb', 'ms-sqlserver', 'mysql-innodb', 'ms-sqlserver',
'sqlite-transactions', 'sqlite-transactions',
'mysql-force-myisam',
'help']) 'help'])
except getopt.error, e: except getopt.error, e:
print 'Troubles with arguments.' print 'Troubles with arguments.'
...@@ -128,11 +139,13 @@ for opt in optlist: ...@@ -128,11 +139,13 @@ for opt in optlist:
continue continue
if when == 'BEFORE_EVERY_TODB': if when == 'BEFORE_EVERY_TODB':
for nw in ('BEFORE_MOVIES_TODB', 'BEFORE_PERSONS_TODB', for nw in ('BEFORE_MOVIES_TODB', 'BEFORE_PERSONS_TODB',
'BEFORE_SQLDATA_TODB', 'BEFORE_AKAMOVIES_TODB'): 'BEFORE_SQLDATA_TODB', 'BEFORE_AKAMOVIES_TODB',
'BEFORE_CHARACTERS_TODB'):
CUSTOM_QUERIES.setdefault(nw, []).append(cmd) CUSTOM_QUERIES.setdefault(nw, []).append(cmd)
elif when == 'AFTER_EVERY_TODB': elif when == 'AFTER_EVERY_TODB':
for nw in ('AFTER_MOVIES_TODB', 'AFTER_PERSONS_TODB', for nw in ('AFTER_MOVIES_TODB', 'AFTER_PERSONS_TODB',
'AFTER_SQLDATA_TODB', 'AFTER_AKAMOVIES_TODB'): 'AFTER_SQLDATA_TODB', 'AFTER_AKAMOVIES_TODB',
'AFTER_CHARACTERS_TODB'):
CUSTOM_QUERIES.setdefault(nw, []).append(cmd) CUSTOM_QUERIES.setdefault(nw, []).append(cmd)
else: else:
CUSTOM_QUERIES.setdefault(when, []).append(cmd) CUSTOM_QUERIES.setdefault(when, []).append(cmd)
...@@ -150,6 +163,51 @@ if URI is None: ...@@ -150,6 +163,51 @@ if URI is None:
print HELP print HELP
sys.exit(2) sys.exit(2)
# Some warnings and notices.
URIlower = URI.lower()
if URIlower.startswith('mysql'):
if '--mysql-force-myisam' in sys.argv[1:] and \
'--mysql-innodb' in sys.argv[1:]:
print '\nWARNING: there is no sense in mixing the --mysql-innodb and\n'\
'--mysql-force-myisam command line options!\n'
elif '--mysql-innodb' in sys.argv[1:]:
print "\nNOTICE: you've specified the --mysql-innodb command line\n"\
"option; you should do this ONLY IF your system uses InnoDB\n"\
"tables or you really want to use InnoDB; if you're running\n"\
"a MyISAM-based database, please omit any option; if you\n"\
"want to force MyISAM usage on a InnoDB-based database,\n"\
"try the --mysql-force-myisam command line option, instead.\n"
elif '--mysql-force-myisam' in sys.argv[1:]:
print "\nNOTICE: you've specified the --mysql-force-myisam command\n"\
"line option; you should do this ONLY IF your system uses\n"\
"InnoDB tables and you want to use MyISAM tables, instead.\n"
else:
print "\nNOTICE: IF you're using InnoDB tables, data insertion can\n"\
"be very slow; you can switch to MyISAM tables - forcing it\n"\
"with the --mysql-force-myisam option - OR use the\n"\
"--mysql-innodb command line option, but DON'T USE these if\n"\
"you're already working on MyISAM tables, because it will\n"\
"force MySQL to use InnoDB, and performances will be poor.\n"
elif URIlower.startswith('mssql') and \
'--ms-sqlserver' not in sys.argv[1:]:
print "\nWARNING: you're using MS SQLServer without the --ms-sqlserver\n"\
"command line option: if something goes wrong, try using it.\n"
elif URIlower.startswith('sqlite') and \
'--sqlite-transactions' not in sys.argv[1:]:
print "\nWARNING: you're using SQLite without the --sqlite-transactions\n"\
"command line option: you'll have very poor performances! Try\n"\
"using it.\n"
if ('--mysql-force-myisam' in sys.argv[1:] and
not URIlower.startswith('mysql')) or ('--mysql-innodb' in
sys.argv[1:] and not URIlower.startswith('mysql')) or ('--ms-sqlserver'
in sys.argv[1:] and not URIlower.startswith('mssql')) or \
('--sqlite-transactions' in sys.argv[1:] and
not URIlower.startswith('sqlite')):
print "\nWARNING: you've specified command line options that don't\n"\
"belong to the database server you're using: proceed at your\n"\
"own risk!\n"
# Connect to the database. # Connect to the database.
conn = setConnection(URI) conn = setConnection(URI)
# Extract exceptions to trap. # Extract exceptions to trap.
...@@ -488,7 +546,7 @@ class _BaseCache(dict): ...@@ -488,7 +546,7 @@ class _BaseCache(dict):
"""Flush to the database.""" """Flush to the database."""
if self._flushing: return if self._flushing: return
self._flushing = 1 self._flushing = 1
if _recursionLevel >= 5: if _recursionLevel >= MAX_RECURSION:
print 'WARNING recursion level exceded trying to flush data' print 'WARNING recursion level exceded trying to flush data'
print 'WARNING this batch of data is lost (%s).' % self.className print 'WARNING this batch of data is lost (%s).' % self.className
self._tmpDict.clear() self._tmpDict.clear()
...@@ -674,6 +732,7 @@ class MoviesCache(_BaseCache): ...@@ -674,6 +732,7 @@ class MoviesCache(_BaseCache):
class PersonsCache(_BaseCache): class PersonsCache(_BaseCache):
"""Manage the persons list.""" """Manage the persons list."""
className = 'PersonsCache'
counter = counter() counter = counter()
def __init__(self, *args, **kwds): def __init__(self, *args, **kwds):
...@@ -794,6 +853,7 @@ class SQLData(dict): ...@@ -794,6 +853,7 @@ class SQLData(dict):
self.sqlString = sqlString self.sqlString = sqlString
self.converter = converter self.converter = converter
self._recursionLevel = 1 self._recursionLevel = 1
self._table = table
self._table_name = tableName(table) self._table_name = tableName(table)
for k, v in d.items(): self[k] = v for k, v in d.items(): self[k] = v
...@@ -817,7 +877,7 @@ class SQLData(dict): ...@@ -817,7 +877,7 @@ class SQLData(dict):
CACHE_MID.flush(quiet=1) CACHE_MID.flush(quiet=1)
CACHE_PID.flush(quiet=1) CACHE_PID.flush(quiet=1)
if _resetRecursion: self._recursionLevel = 1 if _resetRecursion: self._recursionLevel = 1
if self._recursionLevel >= 5: if self._recursionLevel >= MAX_RECURSION:
print 'WARNING recursion level exceded trying to flush data' print 'WARNING recursion level exceded trying to flush data'
print 'WARNING this batch of data is lost.' print 'WARNING this batch of data is lost.'
self.clear() self.clear()
...@@ -841,7 +901,8 @@ class SQLData(dict): ...@@ -841,7 +901,8 @@ class SQLData(dict):
print ' * TOO MANY DATA (%s items), SPLITTING (run #%d)...' % \ print ' * TOO MANY DATA (%s items), SPLITTING (run #%d)...' % \
(len(self), self._recursionLevel) (len(self), self._recursionLevel)
self._recursionLevel += 1 self._recursionLevel += 1
newdata = self.__class__(sqlString=self.sqlString, newdata = self.__class__(table=self._table,
sqlString=self.sqlString,
converter=self.converter) converter=self.converter)
newdata._recursionLevel = self._recursionLevel newdata._recursionLevel = self._recursionLevel
newflushEvery = self.flushEvery / 2 newflushEvery = self.flushEvery / 2
...@@ -997,7 +1058,7 @@ def doCast(fp, roleid, rolename): ...@@ -997,7 +1058,7 @@ def doCast(fp, roleid, rolename):
pass pass
movieid = CACHE_MID.addUnique(title) movieid = CACHE_MID.addUnique(title)
if role is not None: if role is not None:
roles = role.split('/') roles = filter(None, [x.strip() for x in role.split('/')])
for role in roles: for role in roles:
cid = CACHE_CID.addUnique(role) cid = CACHE_CID.addUnique(role)
sqldata.add((pid, movieid, cid, note, order)) sqldata.add((pid, movieid, cid, note, order))
...@@ -1011,8 +1072,13 @@ def doCast(fp, roleid, rolename): ...@@ -1011,8 +1072,13 @@ def doCast(fp, roleid, rolename):
print 'CLOSING %s...' % rolename print 'CLOSING %s...' % rolename
def castLists(): def castLists(_charIDsList=None):
"""Read files listed in the 'role' column of the 'roletypes' table.""" """Read files listed in the 'role' column of the 'roletypes' table."""
# _charIDsList is a dirty hack to allow us to get rid of
# CACHE_CID as soon as possible (right after that actresses.list.gz
# is processed).
if _charIDsList is None:
_charIDsList = []
for rt in RoleType.select(): for rt in RoleType.select():
roleid = rt.id roleid = rt.id
rolename = fname = rt.role rolename = fname = rt.role
...@@ -1027,12 +1093,22 @@ def castLists(): ...@@ -1027,12 +1093,22 @@ def castLists():
f = SourceFile(fname, start=CAST_START, stop=CAST_STOP) f = SourceFile(fname, start=CAST_START, stop=CAST_STOP)
except IOError: except IOError:
if rolename == 'actress': if rolename == 'actress':
try:
restoreImdbID(_charIDsList, CharName)
del _charIDsList
except Exception, e:
print 'WARNING: failed to restore imdbIDs for characters:',e
CACHE_CID.flush() CACHE_CID.flush()
CACHE_CID.clear() CACHE_CID.clear()
continue continue
doCast(f, roleid, rolename) doCast(f, roleid, rolename)
f.close() f.close()
if rolename == 'actress': if rolename == 'actress':
try:
restoreImdbID(_charIDsList, CharName)
del _charIDsList
except Exception, e:
print 'WARNING: failed to restore imdbIDs for characters:', e
CACHE_CID.flush() CACHE_CID.flush()
CACHE_CID.clear() CACHE_CID.clear()
t('castLists(%s)' % rolename) t('castLists(%s)' % rolename)
...@@ -1593,6 +1669,7 @@ def completeCast(): ...@@ -1593,6 +1669,7 @@ def completeCast():
CACHE_MID = MoviesCache() CACHE_MID = MoviesCache()
CACHE_PID = PersonsCache() CACHE_PID = PersonsCache()
CACHE_CID = CharactersCache() CACHE_CID = CharactersCache()
CACHE_CID.className = 'CharactersCache'
def _cmpfunc(x, y): def _cmpfunc(x, y):
...@@ -1632,7 +1709,8 @@ def notNULLimdbID(cls): ...@@ -1632,7 +1709,8 @@ def notNULLimdbID(cls):
"""Return a list of dictionaries for titles or names for which a """Return a list of dictionaries for titles or names for which a
imdbID is present in the database.""" imdbID is present in the database."""
if cls is Title: cname = 'movies' if cls is Title: cname = 'movies'
else: cname = 'people' elif cls is Name: cname = 'people'
else: cname = 'characters'
print 'SAVING imdbID values for %s...' % cname, print 'SAVING imdbID values for %s...' % cname,
sys.stdout.flush() sys.stdout.flush()
try: try:
...@@ -1666,9 +1744,12 @@ def restoreImdbID(tons, cls): ...@@ -1666,9 +1744,12 @@ def restoreImdbID(tons, cls):
if cls is Title: if cls is Title:
CACHE = CACHE_MID CACHE = CACHE_MID
cname = 'movies' cname = 'movies'
else: elif cls is Name:
CACHE = CACHE_PID CACHE = CACHE_PID
cname = 'people' cname = 'people'
else:
CACHE = CACHE_CID
cname = 'characters'
print 'RESTORING imdbID values for %s...' % cname, print 'RESTORING imdbID values for %s...' % cname,
sys.stdout.flush() sys.stdout.flush()
count = 0 count = 0
...@@ -1737,14 +1818,19 @@ def run(): ...@@ -1737,14 +1818,19 @@ def run():
# Storing imdbIDs for movies and persons. # Storing imdbIDs for movies and persons.
try: try:
movies_imdbIDs = notNULLimdbID(Title) movies_imdbIDs = notNULLimdbID(Title)
except: except Exception, e:
movies_imdbIDs = [] movies_imdbIDs = []
print 'WARNING: failed to read imdbIDs for movies' print 'WARNING: failed to read imdbIDs for movies: %s' % e
try: try:
people_imdbIDs = notNULLimdbID(Name) people_imdbIDs = notNULLimdbID(Name)
except: except Exception, e:
people_imdbIDs = [] people_imdbIDs = []
print 'WARNING: failed to read imdbIDs for people' print 'WARNING: failed to read imdbIDs for people: %s' % e
try:
characters_imdbIDs = notNULLimdbID(CharName)
except Exception, e:
characters_imdbIDs = []
print 'WARNING: failed to read imdbIDs for characters: %s' % e
executeCustomQueries('BEFORE_DROP') executeCustomQueries('BEFORE_DROP')
...@@ -1775,13 +1861,15 @@ def run(): ...@@ -1775,13 +1861,15 @@ def run():
# Comment readMovieList() and uncomment the following two lines # Comment readMovieList() and uncomment the following two lines
# to keep the current info in the name and title tables. # to keep the current info in the name and title tables.
##CACHE_MID.populate() ##CACHE_MID.populate()
##CACHE_PID.populate()
t('readMovieList()') t('readMovieList()')
# actors, actresses, producers, writers, cinematographers, composers, # actors, actresses, producers, writers, cinematographers, composers,
# costume-designers, directors, editors, miscellaneous, # costume-designers, directors, editors, miscellaneous,
# production-designers. # production-designers.
castLists() castLists(_charIDsList=characters_imdbIDs)
##CACHE_PID.populate()
##CACHE_CID.populate()
del characters_imdbIDs
# Aka names and titles. # Aka names and titles.
doAkaNames() doAkaNames()
...@@ -1824,13 +1912,13 @@ def run(): ...@@ -1824,13 +1912,13 @@ def run():
try: try:
restoreImdbID(movies_imdbIDs, Title) restoreImdbID(movies_imdbIDs, Title)
del movies_imdbIDs del movies_imdbIDs
except: except Exception, e:
print 'WARNING: failed to restore imdbIDs for movies' print 'WARNING: failed to restore imdbIDs for movies: %s' % e
try: try:
restoreImdbID(people_imdbIDs, Name) restoreImdbID(people_imdbIDs, Name)
del people_imdbIDs del people_imdbIDs
except: except Exception, e:
print 'WARNING: failed to restore imdbIDs for people' print 'WARNING: failed to restore imdbIDs for people: %s' % e
# Flush caches. # Flush caches.
CACHE_MID.flush() CACHE_MID.flush()
......
...@@ -15,12 +15,24 @@ I'd like to thank the following people for their help: ...@@ -15,12 +15,24 @@ I'd like to thank the following people for their help:
* Jesper Nøhr for a lot of testing, especially on 'sql'. * Jesper Nøhr for a lot of testing, especially on 'sql'.
* Mark Armendariz for a bug report about too long field in MySQL db
and some tests/analyses.
* Alexy Khrabrov, for a report about a subtle bug in imdbpy2sql.py.
* Clark Bassett for bug reports and fixes about the imdbpy2sql.py
script and the cutils.c C module.
* mumas for reporting a bug in summary methods.
* Ken R. Garland for a bug report about 'cover url' and a lot of * Ken R. Garland for a bug report about 'cover url' and a lot of
other hints. other hints.
* Steven Ovits for hints and tests with Microsoft SQL Server, SQLExpress * Steven Ovits for hints and tests with Microsoft SQL Server, SQLExpress
and preliminary work on supporting diff files. and preliminary work on supporting diff files.
* Fredrik Arnell for tests and bug reports about the imdbpy2sql.py script.
* Arnab for a bug report in the imdbpy2sql.py script. * Arnab for a bug report in the imdbpy2sql.py script.
* Elefterios Stamatogiannakis for the hint about transactions and SQLite, * Elefterios Stamatogiannakis for the hint about transactions and SQLite,
......
Changelog for IMDbPY Changelog for IMDbPY
==================== ====================
* What's the new in release 3.5 "Blade Runner" (19 Apr 2008)
[general]
- first changes to work on Symbian mobile phones.
- now there is an imdb.available_access_systems() function, that can
be used to get a list of available data access systems.
- it's possible to pass 'results' as a parameter of the imdb.IMDb
function; it sets the number of results to return for queries.
- fixed summary() method in Movie and Person, to correctly handle
unicode chars.
- the helpers.makeObject2Txt function now supports recursion over
dictionaries.
- cutils.c MXLINELEN increased from 512 to 1024; some critical
strcpy replaced with strncpy.
- fixed configuration parser to be compatible with Python 2.2.
- updated list of articles and some stats in the comments.
- documentation updated.
[sql]
- fixed minor bugs in imdbpy2sql.py.
- restores imdbIDs for characters.
- now CharactersCache honors custom queries.
- the imdbpy2sql.py's --mysql-force-myisam command line option can be
used to force usage of MyISAM tables on InnoDB databases.
- added some warnings to the imdbpy2sql.py script.
[local]
- fixed a bug in the fall-back function used to scan movie titles,
when the cutils module is not available.
- mini biographies are cut up to 2**16-1 chars, to prevent troubles
with some MySQL servers.
- fixed bug in characters4local.py, dealing with some garbage in the files.
* What's the new in release 3.4 "Flatliners" (16 Dec 2007) * What's the new in release 3.4 "Flatliners" (16 Dec 2007)
[general] [general]
- *** NOTE FOR PACKAGERS *** in the docs directory there is the - *** NOTE FOR PACKAGERS *** in the docs directory there is the
......
...@@ -7,7 +7,7 @@ database; this required some substantial changes to how actors' ...@@ -7,7 +7,7 @@ database; this required some substantial changes to how actors'
and actresses' roles were handled. and actresses' roles were handled.
Starting with release 3.4, "local" and "sql" data access systems Starting with release 3.4, "local" and "sql" data access systems
are supported, too - but they work a bit differently from "http" are supported, too - but they work a bit differently from "http"
and "mobile". See "MOBILE AND LOCAL" below. and "mobile". See "SQL AND LOCAL" below.
The currentRole instance attribute can be found in every instance The currentRole instance attribute can be found in every instance
of Person, Movie and Character classes, even if actually the Character of Person, Movie and Character classes, even if actually the Character
...@@ -50,14 +50,16 @@ will return a good-old-unicode string, like expected in the previous ...@@ -50,14 +50,16 @@ will return a good-old-unicode string, like expected in the previous
version of IMDbPY. version of IMDbPY.
MOBILE AND LOCAL SQL AND LOCAL
================ =============
Fetching data from the web, only characters with an active page Fetching data from the web, only characters with an active page
on the web site will have their characterID; we don't have these on the web site will have their characterID; we don't have these
information accessing "sql" and "local", so _every_ character information accessing "sql" and "local", so _every_ character
will have an associated characterID. will have an associated characterID.
This way, every character with the same name will share the same ID. This way, every character with the same name will share the same
characterID, even if - in fact - they may not be portraying the
same character.
For "local", to activate support for characters, you have to For "local", to activate support for characters, you have to
run the characters4local.py script, specifying the directory run the characters4local.py script, specifying the directory
......
...@@ -8,8 +8,10 @@ file. ...@@ -8,8 +8,10 @@ file.
Obviously you can still prefer to use the 'local' data access Obviously you can still prefer to use the 'local' data access
system if you're already using the moviedb program. system if you're already using the moviedb program.
NOTE: see README.currentRole for information about character support. NOTE: see README.currentRole for information about character support;
to put it simply, after you've installed everything you can use the
characters4local.py script to generate files for characters (it will
require some time).
Select a mirror of the "The Plain Text Data Files" from Select a mirror of the "The Plain Text Data Files" from
the http://www.imdb.com/interfaces.html page and download the http://www.imdb.com/interfaces.html page and download
...@@ -25,8 +27,9 @@ NOTE: the current (3.24) moviedb version is old and it was not ...@@ -25,8 +27,9 @@ NOTE: the current (3.24) moviedb version is old and it was not
thought with tv series episodes support in mind. thought with tv series episodes support in mind.
It can still work very well, but you've to modify some constants It can still work very well, but you've to modify some constants
in the code: edit the "moviedb.h" file in the "src" directory, in the code: edit the "moviedb.h" file in the "src" directory,
and change MAXTITLES to _at least_ 1400000, MAXNAKAENTRIES and change MAXTITLES to _at least_ 1600000, MAXNAKAENTRIES
to 700000 and LINKSTART to 1000000. to 700000, MAXFILMOGRAPHIES to 20470 and LINKSTART to 1000000.
Also, setting MXLINELEN to 1023 is a good idea.
See http://us.imdb.com/database_statistics for more up-to-date See http://us.imdb.com/database_statistics for more up-to-date
statistics. statistics.
......
...@@ -84,8 +84,8 @@ is available at: ...@@ -84,8 +84,8 @@ is available at:
On some mobile phone a pair of modules can be missing, and On some mobile phone a pair of modules can be missing, and
you have to install it manually as libraries; you can find you have to install it manually as libraries; you can find
these two modules (sgmllib.py and htmlentitydefs.py) here: these modules (sgmllib.py, htmlentitydefs.py and ConfigParser.py) here:
http://imdbpy.sourceforge.net/symbiangui/mobile-imdbpy-modules-0.1.tar.gz http://imdbpy.sourceforge.net/?page=mobile
THE "HTTPTHIN" DATA ACCESS SYSTEM THE "HTTPTHIN" DATA ACCESS SYSTEM
......
...@@ -89,7 +89,7 @@ The fastest database appears to be MySQL, with about 95 minutes to ...@@ -89,7 +89,7 @@ The fastest database appears to be MySQL, with about 95 minutes to
complete on my test system (read below). complete on my test system (read below).
A lot of memory (RAM or swap space) is required, in the range of A lot of memory (RAM or swap space) is required, in the range of
at least 150/200 megabytes (plus more for the database server). at least 150/200 megabytes (plus more for the database server).
In the end, the database will require between 1.5GB and 3GB of disc space. In the end, the database will require between 2.5GB and 5GB of disc space.
As said, the performance varies greatly using a database server or another: As said, the performance varies greatly using a database server or another:
MySQL, for instance, has an executemany() method of the cursor object MySQL, for instance, has an executemany() method of the cursor object
...@@ -98,21 +98,18 @@ database requires a call to the execute() method for every single row ...@@ -98,21 +98,18 @@ database requires a call to the execute() method for every single row
of data, and they will be much slower - from 2 to 7 times slower than of data, and they will be much slower - from 2 to 7 times slower than
MySQL. MySQL.
I've done some tests, using an AMD Athlon 1800+, 512MB of RAM, over a I've done some tests, using an AMD Athlon 1800+, 1GB of RAM, over a
complete plain text data files set (as of 12 Nov 2006, with about complete plain text data files set (as of 11 Apr 2008, with more than
890.000 titles and over 2.000.000 names): 1.200.000 titles and over 2.200.000 names):
database | time in minutes: total (insert data/create indexes) database | time in minutes: total (insert data/create indexes)
----------------------+----------------------------------------------------- ----------------------+-----------------------------------------------------
MySQL 5.0 MyISAM | 115 (95/20) MySQL 5.0 MyISAM | 205 (160/45)
MySQL 5.0 InnoDB | ??? (80/???) MySQL 5.0 InnoDB | _untested_, see NOTES below.
| maybe I've not cofigurated it properly: it PostgreSQL 8.1 | 560 (530/30)
| looks like the creation of the indexes will SQLite 3.3 | ??? (150/???) - very slow building indexes.
| take more than 2 or 3 hours. But see NOTES below. | Timed with the "--sqlite-transactions" command
PostgreSQL 8.1 | 190 (177/13) | line option; otherwise it's _really_ slow: even
SQLite 3.2 | ??? (80/???)
| with the "--sqlite-transactions" command line
| option; otherwise it's _really_ slow: even
| 35 hours or more. | 35 hours or more.
SQL Server | about 3 or 4 hours. SQL Server | about 3 or 4 hours.
...@@ -127,12 +124,24 @@ The imdbpy2sql.py will print a lot of debug information on standard output; ...@@ -127,12 +124,24 @@ The imdbpy2sql.py will print a lot of debug information on standard output;
you can save it in a file, appending (without quotes) "2>&1 | tee output.txt" you can save it in a file, appending (without quotes) "2>&1 | tee output.txt"
[MySQL InnoDB] [MySQL]
In general, if you get an embarrassingly high number of "TOO MANY DATA
... SPLITTING" lines, consider increasing max_allowed_packet (in the
configuration of your MySQL server) to at least 8M or 16M.
Otherwise, inserting the data will be very slow, and some data may
be lost.
[MySQL InnoDB and MyISAM]
InnoDB is abysmally slow for our purposes: my suggestion is to always InnoDB is abysmally slow for our purposes: my suggestion is to always
use MyISAM tables and - if you really want to use InnoDB - convert use MyISAM tables and - if you really want to use InnoDB - convert
the tables later. the tables later.
The imdbpy2sql.py script provides a simple way to manage this case, The imdbpy2sql.py script provides a simple way to manage these cases,
see ADVANCED FEATURES below. see ADVANCED FEATURES below.
In my opinion, the cleaner thing to do is to set the server to use
MyISAM tables or - if you can't modify the server - use the
--mysql-force-myisam command line option of imdbpy2sql.py.
Anyway, if you really need to use InnoDB, in the server-side settings Anyway, if you really need to use InnoDB, in the server-side settings
I recommend to set innodb_file_per_table to "true". I recommend to set innodb_file_per_table to "true".
...@@ -232,9 +241,9 @@ or BEFORE_CREATE time...), replacing the "%(table)s" text in the QUERY ...@@ -232,9 +241,9 @@ or BEFORE_CREATE time...), replacing the "%(table)s" text in the QUERY
with the appropriate table name. with the appropriate table name.
Other available TIMEs are: 'BEFORE_MOVIES_TODB', 'AFTER_MOVIES_TODB', Other available TIMEs are: 'BEFORE_MOVIES_TODB', 'AFTER_MOVIES_TODB',
'BEFORE_PERSONS_TODB', 'AFTER_PERSONS_TODB', 'BEFORE_SQLDATA_TODB', 'BEFORE_PERSONS_TODB', 'AFTER_PERSONS_TODB', 'BEFORE_CHARACTERS_TODB',
'AFTER_SQLDATA_TODB', 'BEFORE_AKAMOVIES_TODB' and 'AFTER_AKAMOVIES_TODB'; 'AFTER_CHARACTERS_TODB', 'BEFORE_SQLDATA_TODB', 'AFTER_SQLDATA_TODB',
they take no modifiers. 'BEFORE_AKAMOVIES_TODB' and 'AFTER_AKAMOVIES_TODB'; they take no modifiers.
Special TIMEs 'BEFORE_EVERY_TODB' and 'AFTER_EVERY_TODB' apply to Special TIMEs 'BEFORE_EVERY_TODB' and 'AFTER_EVERY_TODB' apply to
every BEFORE_* and AFTER_* TIME above mentioned. every BEFORE_* and AFTER_* TIME above mentioned.
These commands are executed before and after every _toDB() call in These commands are executed before and after every _toDB() call in
......
...@@ -32,8 +32,9 @@ ...@@ -32,8 +32,9 @@
accessSystem = http accessSystem = http
# Optional: # Optional:
#proxy = http://localhost:8080/