Commit 8bc38665 authored by Ana Guerrero López's avatar Ana Guerrero López

Import Upstream version 4.0

parent ccbaffd4
#!/usr/bin/env python
"""
get_keyword.py
Usage: get_keyword "keyword"
search for movies tagged with the given keyword and print the results.
"""
import sys
# Import the IMDbPY package.
try:
import imdb
except ImportError:
print 'You bad boy! You need to install the IMDbPY package!'
sys.exit(1)
if len(sys.argv) != 2:
print 'Only one argument is required:'
print ' %s "keyword"' % sys.argv[0]
sys.exit(2)
name = sys.argv[1]
i = imdb.IMDb()
in_encoding = sys.stdin.encoding or sys.getdefaultencoding()
out_encoding = sys.stdout.encoding or sys.getdefaultencoding()
name = unicode(name, in_encoding, 'replace')
try:
# Do the search, and get the results (a list of movies).
results = i.get_keyword(name, results=20)
except imdb.IMDbError, e:
print "Probably you're not connected to Internet. Complete error report:"
print e
sys.exit(3)
# Print the results.
print ' %s result%s for "%s":' % (len(results),
('', 's')[len(results) != 1],
name.encode(out_encoding, 'replace'))
print ' : movie title'
# Print the long imdb title for every movie.
for idx, movie in enumerate(results):
outp = u'%d: %s' % (idx+1, movie['long imdb title'])
print outp.encode(out_encoding, 'replace')
#!/usr/bin/env python
"""
get_top_bottom_movies.py
Usage: get_top_bottom_movies
Return top and bottom 10 movies, by ratings.
"""
import sys
# Import the IMDbPY package.
try:
import imdb
except ImportError:
print 'You bad boy! You need to install the IMDbPY package!'
sys.exit(1)
if len(sys.argv) != 1:
print 'No arguments are required.'
sys.exit(2)
i = imdb.IMDb()
top250 = i.get_top250_movies()
bottom100 = i.get_bottom100_movies()
out_encoding = sys.stdout.encoding or sys.getdefaultencoding()
for label, ml in [('top 10', top250[:10]), ('bottom 10', bottom100[:10])]:
print ''
print '%s movies' % label
print 'rating\tvotes\ttitle'
for movie in ml:
outl = u'%s\t%s\t%s' % (movie.get('rating'), movie.get('votes'),
movie['long imdb title'])
print outl.encode(out_encoding, 'replace')
This diff is collapsed.
#!/usr/bin/env python
"""
search_keyword.py
Usage: search_keyword "keyword"
Search for keywords similar to the give one and print the results.
"""
import sys
# Import the IMDbPY package.
try:
import imdb
except ImportError:
print 'You bad boy! You need to install the IMDbPY package!'
sys.exit(1)
if len(sys.argv) != 2:
print 'Only one argument is required:'
print ' %s "keyword name"' % sys.argv[0]
sys.exit(2)
name = sys.argv[1]
i = imdb.IMDb()
in_encoding = sys.stdin.encoding or sys.getdefaultencoding()
out_encoding = sys.stdout.encoding or sys.getdefaultencoding()
name = unicode(name, in_encoding, 'replace')
try:
# Do the search, and get the results (a list of keyword strings).
results = i.search_keyword(name, results=20)
except imdb.IMDbError, e:
print "Probably you're not connected to Internet. Complete error report:"
print e
sys.exit(3)
# Print the results.
print ' %s result%s for "%s":' % (len(results),
('', 's')[len(results) != 1],
name.encode(out_encoding, 'replace'))
print ' : keyword'
# Print every keyword.
for idx, keyword in enumerate(results):
outp = u'%d: %s' % (idx+1, keyword)
print outp.encode(out_encoding, 'replace')
#!/usr/bin/env python
"""
topbottom4local.py script.
This script creates some files to access top 250/bottom 10 information
from the 'local' data access system.
Copyright 2009 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import os
import sys
import gzip
import shelve
HELP = """topbottom4local.py usage:
%s /directory/with/plain/text/data/files/ /directory/with/local/files/
# NOTE: you need read and write access to the second directory.
""" % sys.argv[0]
if len(sys.argv) != 3:
print 'Specify both source and target directories!'
print HELP
sys.exit(1)
# Directory containing the IMDb's Plain Text Data Files.
IMDB_PTDF_DIR = sys.argv[1]
LOCAL_DATA_DIR = sys.argv[2]
beforeTop = 'TOP 250 MOVIES'
beforeBottom = 'BOTTOM 10 MOVIES'
beforeList = 'New'
def getIDs(keyFile):
    """Return a dictionary mapping values to IDs, as taken from a .key
    plain text data file.

    Lines are in the 'value|hexID' format; malformed lines are skipped."""
    mapping = {}
    keyFd = open(keyFile, 'r')
    for curLine in keyFd:
        pieces = curLine.split('|')
        if len(pieces) == 2:
            value, hexID = pieces
            # IDs are stored as hexadecimal strings.
            mapping[value] = int(hexID, 16)
    keyFd.close()
    return mapping
def toBin3(v):
    """Return a string (little-endian) from a numeric value.

    Only the lowest 24 bits of the value are used."""
    # Least significant byte first.
    return chr(v & 255) + chr((v >> 8) & 255) + chr((v >> 16) & 255)
# Load the title -> movieID mapping once, at import time; it is
# consumed by manageLine() below.  Reading titles.key can take a
# while, hence the progress message (flushed so it shows immediately).
print 'Reading titles.key...',
sys.stdout.flush()
MOVIEIDS = getIDs(os.path.join(LOCAL_DATA_DIR, 'titles.key'))
print 'DONE!'
def manageLine(l):
    """Extract information from a single line of the lists.

    Return a dictionary with the parsed data, or None when the line is
    not a valid four-field entry or the title is not a known movie."""
    # Fields are space separated; drop the empty strings produced by
    # runs of consecutive spaces.
    parts = [p for p in l.split(' ') if p]
    if len(parts) != 4:
        return None
    distrib, votes, rating, title = parts
    distrib = unicode(distrib)
    votes = int(votes)
    rating = float(rating)
    # Map the title to its local movieID; unknown titles are skipped.
    movieID = MOVIEIDS.get(title)
    if movieID is None:
        return None
    return {'votes distribution': distrib, 'votes': votes,
            'rating': rating, 'movieID': movieID}
def getLines(fd, before):
    """Retrieve information from the lists.

    Scan the file object for the section starting with the 'before'
    marker and return the list of entries parsed by manageLine()."""
    # state 0: looking for the section header line;
    # state 1: header seen, waiting for the list marker (beforeList);
    # state 2: inside the list, collecting entries until a blank line.
    state = 0
    collected = []
    for rawLine in fd:
        if state == 2:
            stripped = rawLine.strip()
            if not stripped:
                # A blank line ends the section.
                break
            entry = manageLine(stripped)
            if entry:
                collected.append(entry)
        elif state == 1:
            if rawLine.startswith(beforeList):
                state = 2
        elif rawLine.startswith(before):
            state = 1
    return collected
def saveList():
"""Save information from the top/bottom lists."""
fd = gzip.open(os.path.join(IMDB_PTDF_DIR, 'ratings.list.gz'))
outShlv = shelve.open(os.path.join(LOCAL_DATA_DIR, 'topbottom.db'), 'n')
print 'Saving top 250 list...',
sys.stdout.flush()
top = getLines(fd, beforeTop)
outShlv['top 250 rank'] = top
print 'DONE!'
print 'Saving bottom 10 list...',
sys.stdout.flush()
fd.seek(0)
bottom = getLines(fd, beforeBottom)
bottom.reverse()
outShlv['bottom 10 rank'] = bottom
print 'DONE!'
fd.close()
outShlv.close()
saveList()
......@@ -6,7 +6,8 @@ NAME: H. Turgut Uyar
EMAIL: <uyar --> tekir.org>
CONTRIBUTION: the whole new "http" data access system (using a DOM and
XPath-based approach) is based on his work. The imdbpykit interface
is wholly copyrighted by him.
was mostly written by him and he holds the copyright over the whole
code (with some portions shared with others).
NAME: Giuseppe "Cowo" Corbelli
......
......@@ -13,6 +13,8 @@ I'd like to thank the following people for their help:
* H. Turgut Uyar for a number of bug reports and a lot of work on
the test-suite.
* Hieu Nguyen for a bug report about fetching real imdbIDs.
* Rdian06 for a patch for movies without plot authors.
* Jesper Nøhr for a lot of testing, especially on 'sql'.
......@@ -25,6 +27,12 @@ I'd like to thank the following people for their help:
of testing and debugging of the ibm_db driver (plus a lot of hints
about how to improve the imdbpy2sql.py script).
* Chris Thompson for some bug reports about summary() methods.
* Ori Cohen for some code and various hints.
* Mike Castle for performance tests with SQLite and numerous hints.
* Indy (indyx) for a bug about series cast parsing using BeautifulSoup.
* Yoav Aviram for a bug report about tv mini-series.
......
Changelog for IMDbPY
====================
* What's new in release 4.0 "Watchmen" (12 Mar 2009)
[general]
- the installer is now based on setuptools.
- new functions get_keyword and search_keyword to handle movie's keywords
(example scripts included).
- Movie/Person/... keys (and whole instances) can be converted to XML.
- two new functions, get_top250_movies and get_bottom100_movies, to
retrieve lists of best/worst movies (example scripts included).
- searching for movies and persons - if present - the 'akas' keyword
is filled, in the results.
- 'quotes' for movies is now always a list of lists.
- the old set of parsers (based on sgmllib.SGMLParser) are gone.
- fixed limitations handling multiple roles (with notes).
- fixed a bug converting somethingIDs to real imdbIDs.
- fixed some summary methods.
- updates to the documentation.
[http]
- adapted BeautifulSoup to lxml (internally, the lxml API is used).
- currentRole is no longer populated, for non-cast entries (everything
ends up into .notes).
- fixed a bug searching for too common terms.
- fixed a bug identifying 'kind', searching for titles.
- fixed a bug parsing airing dates.
- fixed a bug searching for company names (when there's a direct hit).
- fixed a bug handling multiple characters.
- fixed a bug parsing episode ratings.
- nicer keys for technical details.
- removed the 'agent' page.
[sql]
- searching for a movie, the original titles are returned, instead
of AKAs.
- support for Foreign Keys.
- minor changes to the db's design.
- fixed a bug populating tables with SQLAlchemy.
- imdbpy2sql.py shows user time and system time, along with wall time.
[local]
- searching for a movie, the original titles are returned, instead
of AKAs.
* What's new in release 3.9 "The Strangers" (06 Jan 2009)
[general]
- introduced the search_episode method, to search for episodes' titles.
......
......@@ -11,7 +11,7 @@ NOTE: IMDbPY includes an unmodified version of BeautifulSoup,
under a New-style BSD license.
Copyright 2004-2008 Davide Alberani <da@erlug.linux.it>
Copyright 2004-2009 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......
......@@ -22,16 +22,16 @@ inside a list of movies in which a given character was portrayed.
Schema:
movie['cast'][0].currentRole -> a Character object.
|
+-> a Person object.
|
+-> a Person object.
person['actor'][0].currentRole -> a Character object.
|
+-> a Movie object.
|
+-> a Movie object.
character['filmography'][0].currentRole -> a Person object.
|
+-> a Movie object.
|
+-> a Movie object.
The roleID attribute can be used to access/set the characterID
or personID instance attribute of the current currentRole.
......
......@@ -35,11 +35,15 @@ imdb (package)
| +-> searchPersonParser
| +-> searchCharacterParser
| +-> searchCompanyParser
| +-> searchKeywordParser
| +-> topBottomParser
| +-> utils
| +-> bsoupadapter.py
| +-> _bsoup.py
| +-> bsoupxpath.py
| +-> lxmladapter.py
| +-> bsouplxml
| |
| +-> _bsoup.py
| +-> etree.py
| +-> html.py
| +-> bsoupxpath.py
|
+-> local (package)
| |
......@@ -97,11 +101,17 @@ http.searchCharacterParser: parse an html string, result of a query for a
character name.
http.searchCompanyParser: parse an html string, result of a query for a
company name.
http.searchKeywordParser: parse an html string, result of a query for a keyword.
http.topBottomParser: parse an html string, result of a query for top250
and bottom100 movies.
http.utils: miscellaneous utilities used only by the http package.
http._bsoup: just a copy of the BeautifulSoup module, so that it's not
an external dependency.
http.bsoupadapter, http.bsoupxpath and http.lxmladapter: adapters for
BeautifulSoup and lxml.
http.bsouplxml (package): adapter to make BeautifulSoup behave like lxml
(internally, the API of lxml is always used).
http.bsouplxml._bsoup: just a copy of the BeautifulSoup module, so that it's not
an external dependency.
http.bsouplxml.etree: adapter for the lxml.etree module.
http.bsouplxml.html: adapter for the lxml.html module.
http.bsouplxml.bsoupxpath: xpath support for beautifulsoup.
The modules under the parser.local package are the same of the
parser.http package (the search functions are placed directly in the
......@@ -112,7 +122,8 @@ The parser.sql package manages the access to the data in the SQL
database, created with the imdbpy2sql.py script; see the README.sqldb file.
The dbschema module contains tables definitions and some useful functions;
The alchemyadapter adapts the SQLAlchemy ORM to the internal mechanisms
of IMDbPY, and the objectadapter does the same for the SQLObject ORM.
of IMDbPY, and the objectadapter does the same for the SQLObject ORM
(internally the API of SQLObject is always used).
The class in the parser.mobile package is a subclass of the one found
in parser.http, with some method overridden to be many times faster (from
......@@ -192,6 +203,10 @@ of the imdb.IMDb class which must define at least the following methods:
information defined for a company object;
should return a dictionary with the relative
information.
_get_top_bottom_movies(kind) - kind can be one of 'top' and 'bottom';
returns the related list of movies.
_get_keyword(keyword) - return a list of Movie objects with the given keyword.
_search_keyword(key) - return a list of keywords similar to the given key.
get_imdbMovieID(movieID) - must convert the given movieID to a string
representing the imdbID, as used by the IMDb web
server (e.g.: '0094226' for Brian De Palma's
......
INFORMATION IN XML FORMAT
=========================
Since version 4.0, IMDbPY can output information of Movie, Person,
Character and Company instances in XML format.
It's possible to get a single information (a key) in XML format,
using the getAsXML(key) method (it will return None if the key is
not found).
E.g.:
from imdb import IMDb
ia = IMDb('http')
movie = ia.get_movie(theMovieID)
print movie.getAsXML('keywords')
It's also possible to get a representation of a whole object,
using the asXML() method:
print movie.asXML()
The returned strings are unicode.
XML FORMAT
==========
Keywords are converted to tags, items in lists are enclosed in
a 'item' tag. E.g.:
<keywords>
<item>a keyword</item>
<item>another keyword</item>
</keywords>
Except when keys are known to be not fixed (e.g.: a list of keywords),
in which case this schema is used:
<item title="EscapedKeyword">
...
</item>
Movie, Person, Character and Company instances are converted like
that (portions enclosed in squares are optionals):
<movie id="movieID" access-system="accessSystem">
<title>A Long IMDb Movie Title (YEAR)</title>
[<current-role>
<person id="personID" access-system="accessSystem">
<name>Name Surname</name>
[<notes>A Note About The Person</notes>]
</person>
</current-role>]
[<notes>A Note About The Movie</notes>]
</movie>
Every 'id' can be empty.
Actually the returned XML is mostly not pretty-printed.
REFERENCES
==========
Some text keys can contain references to other movies, persons
and characters. The user can provide the defaultModFunct function (see
the "MOVIE TITLES AND PERSON/CHARACTER NAMES REFERENCES" section of
the README.package file), to replace these references with their own
strings (e.g.: a link to a web page); it's up to the user, to be sure
that the output of the defaultModFunct function is valid XML.
KEYWORDS
========
Since version 4.0, it's possible (for every data access system) to
search for movies' keywords.
People's keywords are not supported.
SEARCH FOR A KEYWORD SIMILAR TO A GIVEN STRING
==============================================
The search_keyword(unicode_string) can be used to search amongst
keywords: a list of keywords similar to the given string will be
returned, sorted by similarity. Notice that the keywords in the
returned list are plain unicode strings, and not instances of
some class (like the ones returned by other search_SOMETHING methods).
E.g.:
from imdb import IMDb
ia = IMDb('http')
print ia.search_keyword(u'alabama')
GET A LIST OF MOVIES FOR A GIVEN KEYWORD
========================================
To get a list of movies that are tagged with the given keyword,
use the get_keyword(unicode_string) method.
E.g.:
from imdb import IMDb
ia = IMDb('http')
print ia.get_keyword(u'alabama')
Beware that by default the list is limited to 100 movies, and
it's not possible to get more results, using 'http'.
Moreover, the lists returned using 'sql' and 'local' are not
sorted in any way.
Another limit is that actually (as of february 2009), the IMDb's
web server is unable to serve pages about non-ascii keywords.
It's a known problem of their systems.
......@@ -27,6 +27,9 @@ and 'mpaa4local.py' scripts.
After that, you can also add support for companies' information:
just run the 'companies4local.py' script!
If you want to access top250/bottom100 lists, run the 'topbottom4local.py'
script.
INSTRUCTIONS
============
......@@ -46,7 +49,8 @@ thought with tv series episodes support in mind.
It can still work very well, but you've to modify some constants
in the code: edit the "moviedb.h" file in the "src" directory,
and change MAXTITLES to _at least_ 1600000, MAXNAKAENTRIES
to 700000, MAXFILMOGRAPHIES to 20470 and LINKSTART to 1000000.
to 700000, MAXFILMOGRAPHIES to 20470, LINKSTART to 1000000
and MAXBIOENTRIES to 500000.
Also, setting MXLINELEN to 1023 is a good idea.
See http://us.imdb.com/database_statistics for more up-to-date
statistics.
......
......@@ -20,36 +20,30 @@ Please read all the following section.
INSTALLATION OPTIONS
====================
You can modify some variables in the setup.py script:
setting DO_LOCAL to 0, the parser.local package is not installed;
You can call the setup.py script with some arguments:
Using --without-local, the parser.local package is not installed;
this is useful if your system doesn't have the space (more than 600Mb)
for a local installation of the IMDb's database.
Preventing the installation of the parser.local package will also save
at least 120Kb of disk space on your system.
The DO_SQL variable, if set to 0, will excludes the parser.sql
The --without-sql argument, if used, will exclude the parser.sql
package; you don't need it if your system does not have any of the
SQLObject or SQLAlchemy packages and/or you don't want to store the
whole IMDb's plain text database files in a SQL database.
If both DO_LOCAL and DO_SQL are set to 0, the parser.common package
is not installed, and the "cutils" C module is not compiled.
If the DO_SCRIPTS variable is set to 0, the example scripts in
the ./bin/ directory are not installed, saving about 65Kb.
If both --without-local and --without-sql are used, the parser.common
package is not installed, and the "cutils" C module is not compiled.
See README.txt, section
Now, if you're installing IMDbPY (using ./setup.py install), you
should take a look at some options, like "--no-compile" and "-O0"
to exclude pyc (saving at least 450Kb of disk space) and pyo
(other 400Kb) files.
to exclude pyc and pyo files, saving hundreds of KBs.
Moreover, if you're creating a package (rpm, deb or whatever),
in the setup.cfg you can exclude from your package things
like the documentation (more than 200Kb) and the icon (~4Kb).
In both situations (installing and packaging), you may also need
to modify the MANIFEST.in file.
like the documentation (more than 200Kb) and the scripts in the
./bin/ directory.
THE "MOBILE" DATA ACCESS SYSTEM
......
......@@ -130,6 +130,12 @@ The search_person(name), get_person(personID) search_character(name)
get_character(characterID), search_company(name) and get_company(companyID)
methods work the same way as search_movie(title) and get_movie(movieID).
The search_keyword(string) method returns a list of unicode strings that are
valid keywords, similar to the one given.
The get_keyword(keyword) method returns a list of Movie instances that
are tagged with the given keyword.
For more information see README.keywords.
The get_imdbMovieID(movieID), get_imdbPersonID(personID),
get_imdbCharacterID(characterID) and get_imdbCompanyID(companyID) take,
respectively, a movieID, a personID, a movieID and a companyID and return
......@@ -369,6 +375,16 @@ i.get_movie_infoset(), i.get_person_infoset(), i.get_character_infoset()
and i.get_company_infoset().
TOP250 / BOTTOM100 LISTS
========================
Since IMDbPY 4.0, it's possible to retrieve the list of top250
and bottom100 movies.
Use the get_top250_movies() and get_bottom100_movies() methods.
Beware that for 'sql' and 'local', the bottom100 list is
limited to the first 10 results. See README.local if you're using 'local'.
Person OBJECTS INSIDE A Movie CLASS AND Movie OBJECTS INSIDE A Person OBJECT
============================================================================
......
......@@ -93,7 +93,7 @@ to your database, with the schema:
scheme://[user[:password]@]host[:port]/database[?parameters]
Where 'scheme' is one in "sqlite", "mysql", "postgres", "firebird",
"interbase", "maxdb", "sapdb", "mssql", "sybase".
"interbase", "maxdb", "sapdb", "mssql", "sybase", "ibm_db_sa".
Some examples:
mysql://user:password@host/database
......@@ -115,7 +115,10 @@ separated values to specify an order of preference).
======
The performances are hugely dependent upon the underlying Python
module/package used to access the database.
module/package used to access the database. The imdbpy2sql.py script
has a number of command line arguments, useful to choose amongst
presets that can improve performances, using specific database servers.
The fastest database appears to be MySQL, with about 200 minutes to
complete on my test system (read below).
A lot of memory (RAM or swap space) is required, in the range of
......@@ -131,6 +134,14 @@ database requires a call to the execute() method for every single row
of data, and they will be much slower - from 2 to 7 times slower than
MySQL.
There are generic suggestions that can lead to better performances,
like turning off your filesystem journaling. Another option is
the use of a ramdisk/tmpfs, if you have enough RAM. Obviously these
have effect only at insert-time: during the day-to-day use, you can
turn your journaling on again. You can also consider the use of
the CSV output, explained below (but be sure that your database
server of choice is able to import CSV files).
I've done some tests, using an AMD Athlon 1800+, 1GB of RAM, over a
complete plain text data files set (as of 11 Apr 2008, with more than
1.200.000 titles and over 2.200.000 names):
......@@ -200,12 +211,17 @@ writing this).
For some reason, SQLite is really slow, except when used with
transactions; you can use the '--sqlite-transactions' command
line option to obtain acceptable performances.
The same command, also turns off "PRAGMA synchronous".
SQLite seems to hugely benefit from the use of a non-journaling
filesystem and/or of a ramdisk/tmpfs: see the generic suggestions
discussed above in the TIMING section.
[SQLite failure]
It seems that, with older versions of the python-sqlite package, the first
run may fail; if you get a DatabaseError exception saying "no such table",
try running again the command with the same arguments.
try running again the command with the same arguments. Double funny, uh? ;-)
[data truncated]
......
......@@ -36,7 +36,9 @@ Everything you need to do is to run, as the root user, the command:
If, for some reason, it doesn't work, you can copy the "./imdb"
directory in the local site-packages directory of the python
major version you're using.
major version you're using, but remember that you'll not satisfy
the required dependencies and neither compile the optional C module,
so use this as your very last resort.
To know what major version of python you've installed, run:
$ python -V
......@@ -45,7 +47,6 @@ the major version is "2.3".
Now copy the "./imdb" directory: