Commit b63f50d4 authored by Ana Guerrero López's avatar Ana Guerrero López

Import Upstream version 3.6

parent 8cf69e1c
......@@ -2,8 +2,8 @@
"""
characters4local.py script.
This script creates some files to manage characters' information
for the 'local' data access system.
This script creates some files to access characters' information
from the 'local' data access system.
Copyright 2007-2008 Davide Alberani <da@erlug.linux.it>
......@@ -135,7 +135,6 @@ def writeData(d, directory):
char2id = anydbm.open(os.path.join(directory, 'character2id.index'), 'n')
findex = open(os.path.join(directory, 'characters.index'), 'wb')
fdata = open(os.path.join(directory, 'characters.data'), 'wb')
fdatawrite = fdata.write
fdatawritelines = fdata.writelines
fdatatell = fdata.tell
fkey = open(os.path.join(directory, 'characters.key'), 'wb')
......@@ -143,7 +142,6 @@ def writeData(d, directory):
offsetList = []
offsetListappend = offsetList.append
dpopitem = d.popitem
dpop = d.pop
print 'Writing characters.key file...',
sys.stdout.flush()
fkey.writelines('%s|%x\n' % (name, d[name][0]) for name in sorted(d))
......
#!/usr/bin/env python
"""
companies4local.py script.
This script creates some files to access companies' information
from the 'local' data access system.
Copyright 2008 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import sys, os, anydbm
from struct import pack
# Usage text, shown when the script is called with the wrong number of
# arguments; %s is replaced by the name the script was invoked with.
HELP = """companies4local.py usage:
%s /directory/with/local/files/
# NOTE: you need read and write access to the specified directory.
See README.companies for more information.
""" % sys.argv[0]

# Exactly one command line argument (the target directory) is required.
if len(sys.argv) != 2:
    print 'Specify the target directory!'
    print HELP
    sys.exit(1)

# Directory the *.data input files are read from and the output files
# are written to.
DATA_DIR = sys.argv[1]
def _buildChrIntsList(length):
"""Auxiliary table for fast conversion from 3-bytes strings to integers."""
chi = []
for j in xrange(length):
chi.append({})
for count in xrange(256):
val = count << j*8L
if val <= sys.maxint: val = int(val)
chi[j][chr(count)] = val
return chi
_chr_ints = _buildChrIntsList(3)
def toDec(v):
    """Convert the (little-endian) byte string v to an integer.

    The character at position i contributes its ordinal shifted left
    by i*8 bits, as looked up in the _chr_ints tables.
    """
    total = 0
    for position, char in enumerate(v):
        total |= _chr_ints[position][char]
    return total
def toBin3(v):
    """Return the 3 lowest bytes of the number v as a little-endian string."""
    low = chr(v & 255)
    middle = chr((v >> 8) & 255)
    high = chr((v >> 16) & 255)
    return low + middle + high
# Dictionary used to store companies information in the
# format: {'companyName': (companyID, [(code, movieID), ...])}
# where code is the number identifying the data file (that is, the kind
# of company) the movieID was read from.
CACHE_CID = {}
def doCompanies(dataF, code, compCount=0):
"""Read the dataF file and populate the CACHE_CID dictionary."""
print 'Start reading file %s.' % dataF
try:
fptr = open(dataF, 'rb')
except IOError, e:
print 'ERROR: unable to read file "%s"; skipping it: %s' % (dataF, e)
return compCount
fread = fptr.read
while True:
piddata = fread(3)
if len(piddata) != 3:
break
# The movieID we're managing.
movieID = toDec(piddata)
length = ord(fread(1))
curComp = fread(length)
if curComp in CACHE_CID:
CACHE_CID[curComp][1].append((code, movieID))
else:
CACHE_CID[curComp] = (compCount, [(code, movieID)])
compCount += 1
fread(3)
fptr.close()
print 'File %s closed.' % dataF
return compCount
def counter(value=0):
    """Endless generator yielding value, value+1, value+2, ..."""
    current = value
    while 1:
        yield current
        current = current + 1
def writeData(d, directory):
"""Write d data into files in the specified directory."""
# Open files.
print 'Start writing data to directory %s.' % directory
comp2id = anydbm.open(os.path.join(directory, 'company2id.index'), 'n')
findex = open(os.path.join(directory, 'companies.index'), 'wb')
fdata = open(os.path.join(directory, 'companies.data'), 'wb')
fdatawritelines = fdata.writelines
fdatatell = fdata.tell
fkey = open(os.path.join(directory, 'companies.key'), 'wb')
# Auxiliary list used to store offsets in the fdata file.
offsetList = []
offsetListappend = offsetList.append
dpopitem = d.popitem
print 'Writing companies.key file...',
sys.stdout.flush()
fkey.writelines('%s|%x\n' % (name, d[name][0]) for name in sorted(d))
fkey.close()
print 'DONE!'
print 'Converting received dictionary...',
sys.stdout.flush()
# Convert the received dictionary in another format, simpler/faster
# to process. It's faster and requires less memory than a
# sorted(d.iteritems(), key=operator.itemgetter(1)) call.
d2 = {}
while True:
try:
name, (compID, items) = dpopitem()
except KeyError:
break
d2[compID] = (name, items)
# Probably this won't free-up any memory space, but...
d = {}
print 'DONE!'
count = 1
d2pop = d2.pop
print 'Start writing data (this may take a while).'
for compID in sorted(d2):
name, items = d2pop(compID)
offsetListappend(fdatatell())
compIDBin = toBin3(compID)
# Map company names to companyIDs.
comp2id[name] = compIDBin
fdatawritelines((compIDBin, # companyID: superfluous,
# but useful for run-time checks.
pack('<H', len(name)), # Length of the name (2-bytes).
name, # Name of the company.
toBin3(len(items)))) # Number of 4-bytes-long items to
# read.
for kind, movieID in items:
fdatawritelines((chr(kind), toBin3(movieID)))
if count % 50000 == 0:
print '* So far, %d companies were written.' % count
count += 1
fdata.close()
comp2id.close()
print 'Writing the companies.index file...',
sys.stdout.flush()
findex.writelines(pack('<L', x) for x in offsetList)
findex.close()
print 'DONE!'
print 'Dump to directory %s complete.' % directory
# Read the data files of every kind of company; the position of a file
# in the sequence is the code stored along with each of its movieIDs,
# and companyIDs keep growing from one file to the next.
lastCC = 0
for code, fname in enumerate(('distributors.data',
                              'production-companies.data',
                              'special-effects-companies.data',
                              'miscellaneous-companies.data')):
    lastCC = doCompanies(os.path.join(DATA_DIR, fname), code,
                         compCount=lastCC)

# Write output files.
writeData(CACHE_CID, DATA_DIR)
#!/usr/bin/env python
"""
get_company.py
Usage: get_company "companyID"
Show some info about the company with the given imdbID (e.g. '0071509'
for "Columbia Pictures [us]").
"""
import sys
# Import the IMDbPY package.
try:
import imdb
except ImportError:
print 'You bad boy! You need to install the IMDbPY package!'
sys.exit(1)
if len(sys.argv) != 2:
print 'Only one argument is required:'
print ' %s "imdbID"' % sys.argv[0]
sys.exit(2)
imdbID = sys.argv[1]
i = imdb.IMDb()
out_encoding = sys.stdout.encoding or sys.getdefaultencoding()
try:
# Get a company object with the data about the company identified by
# the given imdbID.
company = i.get_company(imdbID)
except imdb.IMDbError, e:
print "Probably you're not connected to Internet. Complete error report:"
print e
sys.exit(3)
if not company:
print 'It seems that there\'s no company with imdbID "%s"' % imdbID
sys.exit(4)
# XXX: this is the easier way to print the main info about a company;
# calling the summary() method of a company object will returns a string
# with the main information about the company.
# Obviously it's not really meaningful if you want to know how
# to access the data stored in a company object, so look below; the
# commented lines show some ways to retrieve information from a
# company object.
print company.summary().encode(out_encoding, 'replace')
#!/usr/bin/env python
"""
get_first_company.py
Usage: get_first_company "company name"
Search for the given name and print the best matching result.
"""
import sys
# Import the IMDbPY package.
try:
import imdb
except ImportError:
print 'You bad boy! You need to install the IMDbPY package!'
sys.exit(1)
if len(sys.argv) != 2:
print 'Only one argument is required:'
print ' %s "company name"' % sys.argv[0]
sys.exit(2)
name = sys.argv[1]
i = imdb.IMDb()
in_encoding = sys.stdin.encoding or sys.getdefaultencoding()
out_encoding = sys.stdout.encoding or sys.getdefaultencoding()
name = unicode(name, in_encoding, 'replace')
try:
# Do the search, and get the results (a list of company objects).
results = i.search_company(name)
except imdb.IMDbError, e:
print "Probably you're not connected to Internet. Complete error report:"
print e
sys.exit(3)
if not results:
print 'No matches for "%s", sorry.' % name.encode(out_encoding, 'replace')
sys.exit(0)
# Print only the first result.
print ' Best match for "%s"' % name.encode(out_encoding, 'replace')
# This is a company instance.
company = results[0]
# So far the company object only contains basic information like the
# name; retrieve main information:
i.update(company)
print company.summary().encode(out_encoding, 'replace')
This diff is collapsed.
#!/usr/bin/env python
"""
misc-companies4local.py script.
This script creates some files to access miscellaneous companies'
information from the 'local' data access system.
Copyright 2008 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import os
import sys
from array import array
from gzip import GzipFile
from struct import pack
from itertools import izip, chain
# Usage text, shown when the script is called with the wrong number of
# arguments; %s is replaced by the name the script was invoked with.
HELP = """misc-companies4local.py usage:
%s /directory/with/plain/text/data/files/ /directory/with/local/files/
# NOTE: you need read and write access to the second directory.
""" % sys.argv[0]

# Exactly two command line arguments are required.
if len(sys.argv) != 3:
    print 'Specify both source and target directories!'
    print HELP
    sys.exit(1)

# Directory containing the IMDb's Plain Text Data Files.
IMDB_PTDF_DIR = sys.argv[1]
# Directory of the 'local' data files: the .key files are read from it
# and the generated/updated files are written to it.
LOCAL_DATA_DIR = sys.argv[2]

# Prefixes of the lines to be skipped, in this order, before the data
# of the list file is read (passed as "start" to SourceFile).
MISC_START = ('MISCELLANEOUS COMPANY LIST', '==========================')
# Name of the gzipped source file and of the files created from it.
MISC_COMP_FILE = 'miscellaneous-companies.list.gz'
MISC_COMP_IDX = 'miscellaneous-companies.index'
MISC_COMP_KEY = 'miscellaneous-companies.key'
MISC_COMP_DATA = 'miscellaneous-companies.data'
# Prefix of the line at which reading stops (passed as "stop" to SourceFile).
MISC_STOP = '---------------'

# Files of the local data directory that are read and/or updated.
TITLES_KEY_FILE = 'titles.key'
ATTRS_KEY_FILE = 'attributes.key'
ATTRS_IDX_FILE = 'attributes.index'

# Shortcut to the unbound readline method of GzipFile.
GzipFileRL = GzipFile.readline
class SourceFile(GzipFile):
"""Instances of this class are used to read gzipped files,
starting from a defined line to a (optionally) given end."""
def __init__(self, filename=None, mode=None, start=(), stop=None,
pwarning=1, *args, **kwds):
filename = os.path.join(IMDB_PTDF_DIR, filename)
try:
GzipFile.__init__(self, filename, mode, *args, **kwds)
except IOError, e:
if not pwarning: raise
print 'WARNING WARNING WARNING'
print 'WARNING unable to read the "%s" file.' % filename
print 'WARNING The file will be skipped, and the contained'
print 'WARNING information will NOT be stored in the database.'
print 'WARNING Complete error: ', e
# re-raise the exception.
raise
self.start = start
for item in start:
itemlen = len(item)
for line in self:
if line[:itemlen] == item: break
self.set_stop(stop)
def set_stop(self, stop):
if stop is not None:
self.stop = stop
self.stoplen = len(self.stop)
self.readline = self.readline_checkEnd
else:
self.readline = self.readline_NOcheckEnd
def readline_NOcheckEnd(self, size=-1):
line = GzipFile.readline(self, size)
return unicode(line, 'latin_1').encode('utf_8')
def readline_checkEnd(self, size=-1):
line = GzipFile.readline(self, size)
if self.stop is not None and line[:self.stoplen] == self.stop: return ''
return line
##return unicode(line, 'latin_1').encode('utf_8')
def getIDs(keyFile):
    """Return a dictionary mapping values to IDs, as taken from a .key
    plain text data file.

    keyFile is the name of a file in LOCAL_DATA_DIR made of
    "value|ID in hexadecimal" lines; lines without a '|' are ignored.
    """
    theDict = {}
    dataF = open(os.path.join(LOCAL_DATA_DIR, keyFile), 'r')
    try:
        for line in dataF:
            # The ID is always the last field: split on the rightmost
            # '|' only, so that values containing a '|' themselves are
            # not discarded.
            lsplit = line.rsplit('|', 1)
            if len(lsplit) != 2:
                continue
            data, idHex = lsplit
            theDict[data] = int(idHex, 16)
    finally:
        # Don't leak the file if a line can't be parsed.
        dataF.close()
    return theDict
def toBin3(v):
    """Return the 3 lowest bytes of the number v as a little-endian string."""
    # Least significant byte first.
    return ''.join([chr((v >> shift) & 255) for shift in (0, 8, 16)])
def doMiscCompanies(MOVIE_IDS, ATTR_IDS):
"""Scan the miscellaneous-companies.list.gz file, creates the
miscellaneous-companies.(index|data) files and updates the
attributes.(index|key) files."""
MOVIE_IDSget = MOVIE_IDS.get
ATTR_IDSget = ATTR_IDS.get
AUX_ATTR_IDS = {}
AUX_ATTR_IDSget = AUX_ATTR_IDS.get
currAttrID = len(ATTR_IDS)
COMPANIES_IDS = {}
COMPANIES_IDSget = COMPANIES_IDS.get
AUX_MOVIE_IDS = {}
AUX_MOVIE_IDSpop = AUX_MOVIE_IDS.pop
currCompanyID = 0
print 'INFO: The first attributeID used will be %d.' % currAttrID
print 'Reading the miscellaneous-companies.list.gz file...',
sys.stdout.flush()
miscCompF = SourceFile(os.path.join(IMDB_PTDF_DIR, MISC_COMP_FILE),
start=MISC_START, stop=MISC_STOP)
for line in miscCompF:
linesplit = filter(None, line.rstrip().split('\t'))
lslen = len(linesplit)
if lslen == 2:
linesplit.append('')
elif lslen != 3:
print 'WARN: discarding line: "%s"' % line
continue
movie, company, attr = linesplit
movieID = MOVIE_IDSget(movie)
if movieID is None:
# Prevent some inconsistencies with movies.list.gz.
print 'WARN: unable to find movieID for "%s"' % movie
continue
# First, check in the dictionary we're building: attributes
# seem to be very specific.
if attr:
attrID = AUX_ATTR_IDSget(attr)
else:
attrID = 0xffffff
if attrID is None:
# Check the main list of attributes.
attrID = ATTR_IDSget(attr)
if attrID is None:
attrID = currAttrID
AUX_ATTR_IDS[attr] = attrID
currAttrID += 1
companyID = COMPANIES_IDSget(company)
if companyID is None:
companyID = currCompanyID
COMPANIES_IDS[company] = companyID
currCompanyID += 1
if movieID not in AUX_MOVIE_IDS:
# Movies to be added to the miscellaneous-companies.data file.
AUX_MOVIE_IDS[movieID] = array('I', (companyID, attrID))
else:
AUX_MOVIE_IDS[movieID].extend((companyID, attrID))
miscCompF.close()
print 'DONE!'
print 'INFO: %d companies, %d attributes read.' % (len(COMPANIES_IDS),
len(AUX_ATTR_IDS))
# Invert COMPANIES_IDS dictionary.
COMPANIES_IDS = dict(izip(COMPANIES_IDS.itervalues(),
COMPANIES_IDS.iterkeys()))
# Auxiliary list used to store offsets in the fdata file.
offsetList = []
offsetListappend = offsetList.append
fdata = open(os.path.join(LOCAL_DATA_DIR, MISC_COMP_DATA), 'wb')
fdatawritelines = fdata.writelines
fdatatell = fdata.tell
# Create the miscellaneous-companies.data file.
print 'Creating the miscellaneous-companies.data file...',
sys.stdout.flush()
for movieID in sorted(AUX_MOVIE_IDS):
items = AUX_MOVIE_IDSpop(movieID)
offsetListappend((movieID, fdatatell()))
for companyID, attrID in izip(*[chain(items, [0xffffff])]*2):
companyName = COMPANIES_IDS[companyID][:255]
fdatawritelines((toBin3(movieID),
chr(len(companyName)),
companyName,
toBin3(attrID)))
fdata.close()
print 'DONE!'
print 'Writing the miscellaneous-companies.index file...',
sys.stdout.flush()
findex = open(os.path.join(LOCAL_DATA_DIR, MISC_COMP_IDX), 'wb')
findex.writelines('%s%s' % (toBin3(movieID), toBin3(ftell))
for movieID, ftell in offsetList)
findex.close()
print 'DONE!'
del AUX_MOVIE_IDS
del COMPANIES_IDS
offsetList = []
offsetListappend = offsetList.append
print 'Updating the attributes.key file...',
sys.stdout.flush()
akeyF = open(os.path.join(LOCAL_DATA_DIR, ATTRS_KEY_FILE), 'ab')
aidxF = open(os.path.join(LOCAL_DATA_DIR, ATTRS_IDX_FILE), 'ab')
AUX_ATTR_IDS = dict(izip(AUX_ATTR_IDS.itervalues(),
AUX_ATTR_IDS.iterkeys()))
for attrID in sorted(AUX_ATTR_IDS):
attr = AUX_ATTR_IDS[attrID]
offsetListappend(akeyF.tell())
akeyF.write('%s|%x\n' % (attr, attrID))
akeyF.close()
print 'DONE!'
print 'Updating the attributes.index file...',
sys.stdout.flush()
aidxF.writelines(pack('<L', x) for x in offsetList)
aidxF.close()
print 'DONE!'
if __name__ == '__main__':
    # Load the title and attribute IDs from the local .key files, then
    # process the list of miscellaneous companies.
    doMiscCompanies(getIDs(TITLES_KEY_FILE), getIDs(ATTRS_KEY_FILE))
#!/usr/bin/env python
"""
mpaa4local.py script.
This script creates some files to access mpaa's information
from the 'local' data access system.
Copyright 2008 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import os
import sys
import gzip
HELP = """companies4local.py usage:
%s /directory/with/plain/text/data/files/ /directory/with/local/files/
# NOTE: you need read and write access to the second directory.
""" % sys.argv[0]
if len(sys.argv) != 3:
print 'Specify both source and target directories!'
print HELP
sys.exit(1)
# Directory containing the IMDb's Plain Text Data Files.
IMDB_PTDF_DIR = sys.argv[1]
LOCAL_DATA_DIR = sys.argv[2]
def getIDs(keyFile):
    """Return a dictionary mapping values to IDs, as taken from a .key
    plain text data file.

    keyFile is the path of a file made of "value|ID in hexadecimal"
    lines; lines without a '|' are ignored.
    """
    theDict = {}
    dataF = open(keyFile, 'r')
    try:
        for line in dataF:
            # The ID is always the last field: split on the rightmost
            # '|' only, so that values containing a '|' themselves are
            # not discarded.
            lsplit = line.rsplit('|', 1)
            if len(lsplit) != 2:
                continue
            data, idHex = lsplit
            theDict[data] = int(idHex, 16)
    finally:
        # Don't leak the file if a line can't be parsed.
        dataF.close()
    return theDict
def toBin3(v):
    """Return the 3 lowest bytes of the number v as a little-endian string."""
    pieces = []
    for shift in (0, 8, 16):
        # Least significant byte first.
        pieces.append(chr((v >> shift) & 255))
    return ''.join(pieces)
def doMPAA():
"""Process mpaa information."""
MOVIE_IDS = getIDs(os.path.join(LOCAL_DATA_DIR, 'titles.key'))
mpaaF = open(os.path.join(LOCAL_DATA_DIR, 'mpaa-ratings-reasons.data'), 'r')
offsetList = []
curOffset = 0L
# NOTE: DON'T use "for line in file", since a read buffer will
# result in wrong tell() numbers.
line = mpaaF.readline()
while line:
if not line.startswith('MV: '):
line = mpaaF.readline()
continue
title = line[4:].strip()
movieID = MOVIE_IDS.get(title)
if movieID is None:
print 'WARN: skipping movie %s.' % title
line = mpaaF.readline()
continue
curOffset = mpaaF.tell() - len(line)
offsetList.append((movieID, curOffset))
line = mpaaF.readline()
mpaaF.close()
offsetList.sort()
idxF = open(os.path.join(LOCAL_DATA_DIR, 'mpaa-ratings-reasons.index'),'wb')
idxF.writelines('%s%s' % (toBin3(movieID), toBin3(ftell))
for movieID, ftell in offsetList)
idxF.close()
# Uncompress the plain text data file into the local data directory;
# this must be done before calling doMPAA, which indexes the
# mpaa-ratings-reasons.data file written here.
mpaaFileGZ = gzip.open(os.path.join(IMDB_PTDF_DIR,
                                    'mpaa-ratings-reasons.list.gz'))
mpaaFileOut = open(os.path.join(LOCAL_DATA_DIR,
                                'mpaa-ratings-reasons.data'), 'w')
for aLine in mpaaFileGZ:
    mpaaFileOut.write(aLine)
mpaaFileOut.close()
mpaaFileGZ.close()

# Build the index of the file just created.
doMPAA()
#!/usr/bin/env python
"""
search_company.py
Usage: search_company "company name"
Search for the given name and print the results.
"""
import sys
# Import the IMDbPY package.
try:
import imdb
except ImportError:
print 'You bad boy! You need to install the IMDbPY package!'
sys.exit(1)
if len(sys.argv) != 2:
print 'Only one argument is required:'
print ' %s "company name"' % sys.argv[0]
sys.exit(2)
name = sys.argv[1]
i = imdb.IMDb()
in_encoding = sys.stdin.encoding or sys.getdefaultencoding()
out_encoding = sys.stdout.encoding or sys.getdefaultencoding()
name = unicode(name, in_encoding, 'replace')
try:
# Do the search, and get the results (a list of company objects).