Skip to content
Commits on Source (5)
Release 1.22 - 07/29/2019
* NOTE: Known regression: PDFBOX-4587 -- PDF passwords with codepoints
between 0xF000 and 0xF0000 will cause an exception.
* Add parser for HWP v5 files via SooMyung Lee (soomyung) and
JinSup Kim (ddoleye) (TIKA-2909).
* Fix order of closing streams to avoid "Failed to close temporary resource"
exception in TesseractOCRParser (TIKA-2908).
* Improve AutoDetectReader performance by caching encoding
detector (TIKA-1568).
* Prevent RTFParser from outputting illegal tag combinations (TIKA-2889).
* Fix RereadableInputStream to release all resources (TIKA-2903).
* Implement custom language identifier in the tika-eval module based on
OpenNLP's language detector; add 18 languages and add common words
lists for all 121 languages (TIKA-2790).
* Fix NPE in MimeTypesReader.releaseParser() via Eamonn Saunders (TIKA-2896).
* Fix RTFParser to extract more content (TIKA-2883).
* Add clientSubmitTime to the metadata extracted from PST files (TIKA-2898).
* Improve StreamingZipContainerDetector for xltx, xltm and
several other file formats (TIKA-2886).
Release 1.21 - 05/14/2019 Release 1.21 - 05/14/2019
* Add optional AUTO mode to OCR'ing of PDFs. If tesseract is installed * Add optional AUTO mode to OCR'ing of PDFs. If tesseract is installed
......
...@@ -413,3 +413,27 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ...@@ -413,3 +413,27 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. THE SOFTWARE.
----------------------------------------------
quine.gz test file (MIT License)
https://twitter.com/WhoStoleHonno/status/1153315367235784704?s=20
Copyright (c) 2019 by Matthew Barber.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
...@@ -26,7 +26,8 @@ ...@@ -26,7 +26,8 @@
<excludes> <excludes>
<exclude>**/target/**</exclude> <exclude>**/target/**</exclude>
<exclude>**/.*/**</exclude> <exclude>**/.*/**</exclude>
<exclude>**/opennlp/*.bin</exclude> <exclude>**/opennlp/ner-*.bin</exclude>
<exclude>**/opennlp/en-*.bin</exclude>
<exclude>**/recognition/*.bin</exclude> <exclude>**/recognition/*.bin</exclude>
<exclude>**/*.releaseBackup</exclude> <exclude>**/*.releaseBackup</exclude>
</excludes> </excludes>
......
tika (1.22-1) unstable; urgency=medium
* New upstream release
- Fixes CVE-2019-10088: A carefully crafted or corrupt zip file can cause
an out of memory error in RecursiveParserWrapper (Closes: #933744)
- Fixes CVE-2019-10094: A carefully crafted package/compressed file that,
when unzipped/uncompressed yields the same file (a quine), causes a stack
overflow error in RecursiveParserWrapper (Closes: #933746)
- Fixes CVE-2019-10093: A carefully crafted 2003ml or 2006ml file could
consume all available SAXParsers in the pool and lead to very long hangs.
(Closes: #933745)
- Refreshed the patches
- Ignore the new dependency on c3p0 (not used)
-- Emmanuel Bourg <ebourg@apache.org> Mon, 05 Aug 2019 11:41:25 +0200
tika (1.21-1) unstable; urgency=medium tika (1.21-1) unstable; urgency=medium
* New upstream release * New upstream release
......
...@@ -15,6 +15,7 @@ com.healthmarketscience.jackcess jackcess * * * * ...@@ -15,6 +15,7 @@ com.healthmarketscience.jackcess jackcess * * * *
com.healthmarketscience.jackcess jackcess-encrypt * * * * com.healthmarketscience.jackcess jackcess-encrypt * * * *
com.levigo.jbig2 levigo-jbig2-imageio * * * * com.levigo.jbig2 levigo-jbig2-imageio * * * *
com.epam parso * * * * com.epam parso * * * *
com.mchange c3p0 * * * *
com.pff java-libpst * * * * com.pff java-libpst * * * *
com.sun.activation jakarta.activation * * * * com.sun.activation jakarta.activation * * * *
de.thetaphi forbiddenapis * * * * de.thetaphi forbiddenapis * * * *
......
...@@ -132,7 +132,7 @@ Forwarded: no ...@@ -132,7 +132,7 @@ Forwarded: no
<dependency> <dependency>
<groupId>org.ow2.asm</groupId> <groupId>org.ow2.asm</groupId>
<artifactId>asm</artifactId> <artifactId>asm</artifactId>
<version>7.1</version> <version>7.2-beta</version>
+ <optional>true</optional> + <optional>true</optional>
</dependency> </dependency>
<dependency> <dependency>
...@@ -156,7 +156,7 @@ Forwarded: no ...@@ -156,7 +156,7 @@ Forwarded: no
<dependency> <dependency>
<groupId>com.rometools</groupId> <groupId>com.rometools</groupId>
<artifactId>rome</artifactId> <artifactId>rome</artifactId>
<version>1.12.0</version> <version>1.12.1</version>
+ <optional>true</optional> + <optional>true</optional>
<exclusions> <exclusions>
<exclusion> <exclusion>
...@@ -206,7 +206,7 @@ Forwarded: no ...@@ -206,7 +206,7 @@ Forwarded: no
@@ -405,6 +434,7 @@ @@ -405,6 +434,7 @@
<groupId>org.apache.opennlp</groupId> <groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId> <artifactId>opennlp-tools</artifactId>
<version>1.9.1</version> <version>${opennlp.version}</version>
+ <optional>true</optional> + <optional>true</optional>
</dependency> </dependency>
...@@ -222,7 +222,7 @@ Forwarded: no ...@@ -222,7 +222,7 @@ Forwarded: no
@@ -434,11 +465,13 @@ @@ -434,11 +465,13 @@
<groupId>com.github.openjson</groupId> <groupId>com.github.openjson</groupId>
<artifactId>openjson</artifactId> <artifactId>openjson</artifactId>
<version>1.0.10</version> <version>1.0.11</version>
+ <optional>true</optional> + <optional>true</optional>
</dependency> </dependency>
<dependency> <dependency>
...@@ -236,15 +236,15 @@ Forwarded: no ...@@ -236,15 +236,15 @@ Forwarded: no
@@ -551,6 +584,7 @@ @@ -551,6 +584,7 @@
<groupId>com.google.protobuf</groupId> <groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId> <artifactId>protobuf-java</artifactId>
<version>3.7.1</version> <version>3.9.0</version>
+ <optional>true</optional> + <optional>true</optional>
</dependency> </dependency>
<dependency> <dependency>
<groupId>edu.ucar</groupId> <groupId>edu.ucar</groupId>
@@ -633,6 +667,7 @@ @@ -644,6 +678,7 @@
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId> <artifactId>commons-csv</artifactId>
<version>1.6</version> <version>1.7</version>
+ <optional>true</optional> + <optional>true</optional>
</dependency> </dependency>
......
...@@ -3,7 +3,7 @@ Author: Emmanuel Bourg <ebourg@apache.org> ...@@ -3,7 +3,7 @@ Author: Emmanuel Bourg <ebourg@apache.org>
Forwarded: not-needed Forwarded: not-needed
--- a/tika-parsers/pom.xml --- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml +++ b/tika-parsers/pom.xml
@@ -962,6 +962,40 @@ @@ -974,6 +974,40 @@
</execution> </execution>
</executions> </executions>
</plugin> </plugin>
......
...@@ -25,14 +25,14 @@ ...@@ -25,14 +25,14 @@
<parent> <parent>
<groupId>org.apache.tika</groupId> <groupId>org.apache.tika</groupId>
<artifactId>tika-parent</artifactId> <artifactId>tika-parent</artifactId>
<version>1.21</version> <version>1.22</version>
<relativePath>tika-parent/pom.xml</relativePath> <relativePath>tika-parent/pom.xml</relativePath>
</parent> </parent>
<artifactId>tika</artifactId> <artifactId>tika</artifactId>
<packaging>pom</packaging> <packaging>pom</packaging>
<name>Apache Tika</name> <name>Apache Tika</name>
<url>http://tika.apache.org</url> <url>https://tika.apache.org</url>
<modules> <modules>
<module>tika-parent</module> <module>tika-parent</module>
...@@ -189,7 +189,7 @@ least three +1 Tika PMC votes are cast. ...@@ -189,7 +189,7 @@ least three +1 Tika PMC votes are cast.
</description> </description>
<organization> <organization>
<name>The Apache Software Foundation</name> <name>The Apache Software Foundation</name>
<url>http://www.apache.org</url> <url>https://www.apache.org</url>
</organization> </organization>
<issueManagement> <issueManagement>
<system>JIRA</system> <system>JIRA</system>
......
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
<parent> <parent>
<groupId>org.apache.tika</groupId> <groupId>org.apache.tika</groupId>
<artifactId>tika-parent</artifactId> <artifactId>tika-parent</artifactId>
<version>1.21</version> <version>1.22</version>
<relativePath>../tika-parent/pom.xml</relativePath> <relativePath>../tika-parent/pom.xml</relativePath>
</parent> </parent>
......
...@@ -31,6 +31,7 @@ import java.util.Map; ...@@ -31,6 +31,7 @@ import java.util.Map;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.tika.utils.ProcessUtils;
import org.junit.After; import org.junit.After;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
...@@ -176,9 +177,7 @@ public class TikaCLIBatchCommandLineTest { ...@@ -176,9 +177,7 @@ public class TikaCLIBatchCommandLineTest {
boolean ex = false; boolean ex = false;
try { try {
String path = testFile.toAbsolutePath().toString(); String path = testFile.toAbsolutePath().toString();
if (path.contains(" ")) { path = ProcessUtils.escapeCommandLine(path);
path = "\"" + path + "\"";
}
String[] params = {testInputPathForCommandLine, path}; String[] params = {testInputPathForCommandLine, path};
String[] commandLine = BatchCommandLineBuilder.build(params); String[] commandLine = BatchCommandLineBuilder.build(params);
......
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
<parent> <parent>
<groupId>org.apache.tika</groupId> <groupId>org.apache.tika</groupId>
<artifactId>tika-parent</artifactId> <artifactId>tika-parent</artifactId>
<version>1.21</version> <version>1.22</version>
<relativePath>../tika-parent/pom.xml</relativePath> <relativePath>../tika-parent/pom.xml</relativePath>
</parent> </parent>
......
...@@ -170,6 +170,7 @@ public abstract class FSBatchTestBase extends TikaTest { ...@@ -170,6 +170,7 @@ public abstract class FSBatchTestBase extends TikaTest {
private String[] commandLine(String testConfig, String loggerProps, String[] args) { private String[] commandLine(String testConfig, String loggerProps, String[] args) {
List<String> commandLine = new ArrayList<>(); List<String> commandLine = new ArrayList<>();
commandLine.add("java"); commandLine.add("java");
commandLine.add("-Djava.awt.headless=true");
commandLine.add("-Dlog4j.configuration=file:"+ commandLine.add("-Dlog4j.configuration=file:"+
this.getClass().getResource(loggerProps).getFile()); this.getClass().getResource(loggerProps).getFile());
commandLine.add("-Xmx128m"); commandLine.add("-Xmx128m");
...@@ -200,6 +201,7 @@ public abstract class FSBatchTestBase extends TikaTest { ...@@ -200,6 +201,7 @@ public abstract class FSBatchTestBase extends TikaTest {
String[] args) throws Exception { String[] args) throws Exception {
List<String> commandLine = new ArrayList<>(); List<String> commandLine = new ArrayList<>();
commandLine.add("java"); commandLine.add("java");
commandLine.add("-Djava.awt.headless=true");
commandLine.add("-Xmx128m"); commandLine.add("-Xmx128m");
commandLine.add("-cp"); commandLine.add("-cp");
String cp = System.getProperty("java.class.path"); String cp = System.getProperty("java.class.path");
......
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
<parent> <parent>
<groupId>org.apache.tika</groupId> <groupId>org.apache.tika</groupId>
<artifactId>tika-parent</artifactId> <artifactId>tika-parent</artifactId>
<version>1.21</version> <version>1.22</version>
<relativePath>../tika-parent/pom.xml</relativePath> <relativePath>../tika-parent/pom.xml</relativePath>
</parent> </parent>
......
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
<parent> <parent>
<groupId>org.apache.tika</groupId> <groupId>org.apache.tika</groupId>
<artifactId>tika-parent</artifactId> <artifactId>tika-parent</artifactId>
<version>1.21</version> <version>1.22</version>
<relativePath>../tika-parent/pom.xml</relativePath> <relativePath>../tika-parent/pom.xml</relativePath>
</parent> </parent>
......
...@@ -22,6 +22,7 @@ import java.io.IOException; ...@@ -22,6 +22,7 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
...@@ -44,6 +45,13 @@ public class AutoDetectReader extends BufferedReader { ...@@ -44,6 +45,13 @@ public class AutoDetectReader extends BufferedReader {
private static final ServiceLoader DEFAULT_LOADER = private static final ServiceLoader DEFAULT_LOADER =
new ServiceLoader(AutoDetectReader.class.getClassLoader()); new ServiceLoader(AutoDetectReader.class.getClassLoader());
private static EncodingDetector DEFAULT_DETECTOR;
static {
DEFAULT_DETECTOR = new CompositeEncodingDetector(
DEFAULT_LOADER.loadServiceProviders(EncodingDetector.class));
}
private static Charset detect( private static Charset detect(
InputStream input, Metadata metadata, InputStream input, Metadata metadata,
List<EncodingDetector> detectors, LoadErrorHandler handler) List<EncodingDetector> detectors, LoadErrorHandler handler)
...@@ -125,14 +133,13 @@ public class AutoDetectReader extends BufferedReader { ...@@ -125,14 +133,13 @@ public class AutoDetectReader extends BufferedReader {
public AutoDetectReader(InputStream stream, Metadata metadata) public AutoDetectReader(InputStream stream, Metadata metadata)
throws IOException, TikaException { throws IOException, TikaException {
this(stream, metadata, DEFAULT_LOADER); this(stream, metadata, DEFAULT_DETECTOR);
} }
public AutoDetectReader(InputStream stream) public AutoDetectReader(InputStream stream)
throws IOException, TikaException { throws IOException, TikaException {
this(stream, new Metadata()); this(stream, new Metadata());
} }
private static InputStream getBuffered(InputStream stream) { private static InputStream getBuffered(InputStream stream) {
if (stream.markSupported()) { if (stream.markSupported()) {
return stream; return stream;
......
...@@ -139,4 +139,7 @@ public interface Office { ...@@ -139,4 +139,7 @@ public interface Office {
Property MAPI_FROM_REPRESENTING_EMAIL = Property.internalText( Property MAPI_FROM_REPRESENTING_EMAIL = Property.internalText(
PREFIX_DOC_META+Metadata.NAMESPACE_PREFIX_DELIMITER+"mapi-from-representing-email"); PREFIX_DOC_META+Metadata.NAMESPACE_PREFIX_DELIMITER+"mapi-from-representing-email");
Property MAPI_MESSAGE_CLIENT_SUBMIT_TIME = Property.internalDate(
PREFIX_DOC_META+Metadata.NAMESPACE_PREFIX_DELIMITER+
"mapi-msg-client-submit-time");
} }
...@@ -142,9 +142,11 @@ public class MimeTypesReader extends DefaultHandler implements MimeTypesReaderMe ...@@ -142,9 +142,11 @@ public class MimeTypesReader extends DefaultHandler implements MimeTypesReaderMe
} catch (SAXException e) { } catch (SAXException e) {
throw new MimeTypeException("Invalid type configuration", e); throw new MimeTypeException("Invalid type configuration", e);
} finally { } finally {
if (parser != null) {
releaseParser(parser); releaseParser(parser);
} }
} }
}
public void read(Document document) throws MimeTypeException { public void read(Document document) throws MimeTypeException {
try { try {
......
...@@ -20,15 +20,19 @@ package org.apache.tika.parser; ...@@ -20,15 +20,19 @@ package org.apache.tika.parser;
import org.apache.tika.exception.CorruptedFileException; import org.apache.tika.exception.CorruptedFileException;
import org.apache.tika.exception.TikaException; import org.apache.tika.exception.TikaException;
import org.apache.tika.io.FilenameUtils; import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property; import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaType;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.sax.RecursiveParserWrapperHandler;
import org.apache.tika.sax.SecureContentHandler;
import org.apache.tika.utils.ExceptionUtils; import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.ParserUtils; import org.apache.tika.utils.ParserUtils;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler; import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
...@@ -221,7 +225,12 @@ public class RecursiveParserWrapper extends ParserDecorator { ...@@ -221,7 +225,12 @@ public class RecursiveParserWrapper extends ParserDecorator {
long started = System.currentTimeMillis(); long started = System.currentTimeMillis();
parserState.recursiveParserWrapperHandler.startDocument(); parserState.recursiveParserWrapperHandler.startDocument();
try { try {
getWrappedParser().parse(stream, localHandler, metadata, context); try (TikaInputStream tis = TikaInputStream.get(stream)) {
RecursivelySecureContentHandler secureContentHandler =
new RecursivelySecureContentHandler(localHandler, tis);
context.set(RecursivelySecureContentHandler.class, secureContentHandler);
getWrappedParser().parse(tis, secureContentHandler, metadata, context);
}
} catch (SAXException e) { } catch (SAXException e) {
boolean wlr = isWriteLimitReached(e); boolean wlr = isWriteLimitReached(e);
if (wlr == false) { if (wlr == false) {
...@@ -367,8 +376,14 @@ public class RecursiveParserWrapper extends ParserDecorator { ...@@ -367,8 +376,14 @@ public class RecursiveParserWrapper extends ParserDecorator {
Parser preContextParser = context.get(Parser.class); Parser preContextParser = context.get(Parser.class);
context.set(Parser.class, new EmbeddedParserDecorator(getWrappedParser(), objectLocation, parserState)); context.set(Parser.class, new EmbeddedParserDecorator(getWrappedParser(), objectLocation, parserState));
long started = System.currentTimeMillis(); long started = System.currentTimeMillis();
RecursivelySecureContentHandler secureContentHandler =
context.get(RecursivelySecureContentHandler.class);
//store the handler that was used before this parse
//so that you can return it back to its state at the end of this parse
ContentHandler preContextHandler = secureContentHandler.handler;
secureContentHandler.updateContentHandler(localHandler);
try { try {
super.parse(stream, localHandler, metadata, context); super.parse(stream, secureContentHandler, metadata, context);
} catch (SAXException e) { } catch (SAXException e) {
boolean wlr = isWriteLimitReached(e); boolean wlr = isWriteLimitReached(e);
if (wlr == true) { if (wlr == true) {
...@@ -390,6 +405,7 @@ public class RecursiveParserWrapper extends ParserDecorator { ...@@ -390,6 +405,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
} }
} finally { } finally {
context.set(Parser.class, preContextParser); context.set(Parser.class, preContextParser);
secureContentHandler.updateContentHandler(preContextHandler);
long elapsedMillis = System.currentTimeMillis() - started; long elapsedMillis = System.currentTimeMillis() - started;
metadata.set(RecursiveParserWrapperHandler.PARSE_TIME_MILLIS, Long.toString(elapsedMillis)); metadata.set(RecursiveParserWrapperHandler.PARSE_TIME_MILLIS, Long.toString(elapsedMillis));
parserState.recursiveParserWrapperHandler.endEmbeddedDocument(localHandler, metadata); parserState.recursiveParserWrapperHandler.endEmbeddedDocument(localHandler, metadata);
...@@ -407,7 +423,46 @@ public class RecursiveParserWrapper extends ParserDecorator { ...@@ -407,7 +423,46 @@ public class RecursiveParserWrapper extends ParserDecorator {
private ParserState(AbstractRecursiveParserWrapperHandler handler) { private ParserState(AbstractRecursiveParserWrapperHandler handler) {
this.recursiveParserWrapperHandler = handler; this.recursiveParserWrapperHandler = handler;
} }
}
private class RecursivelySecureContentHandler
extends SecureContentHandler {
private ContentHandler handler;
/**
 * Creates the handler, passing both arguments on to the
 * {@code SecureContentHandler} superclass constructor.
 *
 * @param handler the content handler that events are initially delegated to
 * @param stream the stream handed to the superclass constructor
 */
public RecursivelySecureContentHandler(ContentHandler handler, TikaInputStream stream) {
super(handler, stream);
// keep our own reference: startElement/endElement call the delegate directly
this.handler = handler;
}
/**
 * Replaces the delegate handler. Both the superclass (via
 * {@code setContentHandler}) and this class's own {@code handler}
 * reference are updated so that they stay in sync.
 *
 * @param handler the new delegate content handler
 */
public void updateContentHandler(ContentHandler handler) {
setContentHandler(handler);
this.handler = handler;
}
/**
 * Bypass the SecureContentHandler...
 *
 * This handler only looks at zip bomb via zip expansion.
 * Users should be protected within entries against
 * nested xml entities. We don't want to carry
 * those stats _across_ entries.
 *
 * The event is therefore forwarded straight to the current delegate
 * handler without calling the superclass implementation.
 *
 * @param uri the namespace URI of the element
 * @param localName the local name of the element
 * @param name the qualified name of the element
 * @param atts the attributes attached to the element
 * @throws SAXException if the delegate handler throws it
 */
@Override
public void startElement(
String uri, String localName, String name, Attributes atts)
throws SAXException {
this.handler.startElement(uri, localName, name, atts);
}
/**
 * Forwards the end-of-element event straight to the current delegate
 * handler without calling the superclass implementation, mirroring
 * {@code startElement}.
 *
 * @param uri the namespace URI of the element
 * @param localName the local name of the element
 * @param name the qualified name of the element
 * @throws SAXException if the delegate handler throws it
 */
@Override
public void endElement(
String uri, String localName, String name) throws SAXException {
this.handler.endElement(uri, localName, name);
}
} }
} }
...@@ -52,8 +52,12 @@ public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandl ...@@ -52,8 +52,12 @@ public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandl
Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"embedded_resource_path"); Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"embedded_resource_path");
private final ContentHandlerFactory contentHandlerFactory; private final ContentHandlerFactory contentHandlerFactory;
private static final int MAX_DEPTH = 100;
private final int maxEmbeddedResources; private final int maxEmbeddedResources;
private int embeddedResources = 0; private int embeddedResources = 0;
private int embeddedDepth = 0;
public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) { public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) {
this(contentHandlerFactory, -1); this(contentHandlerFactory, -1);
...@@ -82,6 +86,10 @@ public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandl ...@@ -82,6 +86,10 @@ public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandl
*/ */
public void startEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { public void startEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
embeddedResources++; embeddedResources++;
embeddedDepth++;
if (embeddedDepth >= MAX_DEPTH) {
throw new SAXException("Max embedded depth reached: "+embeddedDepth);
}
} }
/** /**
* This is called after parsing each embedded document. Override this * This is called after parsing each embedded document. Override this
...@@ -92,6 +100,7 @@ public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandl ...@@ -92,6 +100,7 @@ public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandl
* @throws SAXException * @throws SAXException
*/ */
public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
embeddedDepth--;
} }
/** /**
......
...@@ -39,4 +39,12 @@ public class ProcessUtils { ...@@ -39,4 +39,12 @@ public class ProcessUtils {
} }
return arg; return arg;
} }
/**
 * Strips one pair of surrounding double quotes from a command-line argument.
 * <p>
 * The quotes are removed only on Windows, and only when the argument
 * contains a space and both starts and ends with a double quote. Every
 * other argument is returned unchanged.
 *
 * @param arg the argument to unquote; may be {@code null}
 * @return the argument without its surrounding quotes, the unchanged
 *         argument when no unquoting applies, or {@code null} when
 *         {@code arg} is {@code null}
 */
public static String unescapeCommandLine(String arg) {
    if (arg == null) {
        // nothing to unquote; return null rather than fail with a NullPointerException
        return null;
    }
    boolean quoted = arg.startsWith("\"") && arg.endsWith("\"");
    if (SystemUtils.IS_OS_WINDOWS && quoted && arg.contains(" ")) {
        // a quoted argument containing a space has at least three characters,
        // so the substring bounds are always valid here
        return arg.substring(1, arg.length() - 1);
    }
    return arg;
}
} }