...
 
Commits (5)
Manifest-Version: 1.0
Ant-Version: Apache Ant 1.8.4
Main-Class: org.htmlcleaner.CommandLine
Manifest-Version: 1.0
Ant-Version: Apache Ant 1.8.4
Main-Class: org.htmlcleaner.CommandLine
\ No newline at end of file
Source: libhtmlcleaner-java
Section: java
Priority: extra
Priority: optional
Maintainer: Alexandre Rossi <alexandre.rossi@gmail.com>
Build-Depends: debhelper (>= 7.0.50~), default-jdk,
maven, libmaven-javadoc-plugin-java,
libjdom1-java
Standards-Version: 3.9.2
Build-Depends: debhelper (>= 10), maven-debian-helper, javahelper
Build-Depends-Indep:
libmaven-javadoc-plugin-java,
libmaven-bundle-plugin-java,
libjdom2-java
Standards-Version: 4.1.3.0
Homepage: http://htmlcleaner.sourceforge.net/
Vcs-Browser: https://salsa.debian.org/debian/libhtmlcleaner-java
Vcs-Git: https://salsa.debian.org/debian/libhtmlcleaner-java.git
Package: libhtmlcleaner-java
Architecture: all
Depends: ${shlibs:Depends}, ${misc:Depends}, ${maven:Depends}
Depends: ${misc:Depends}, ${java:Depends}
Description: Java HTML Parser library
HtmlCleaner can be used in java code, as command line tool or as Ant task.
It is designed to be small, independent (no runtime dependencies except
......@@ -20,3 +22,19 @@ Description: Java HTML Parser library
parameters). Although the main motive was to prepare ordinary HTML for XML
processing with XPath, XQuery and XSLT, structured data produced by
HtmlCleaner may be consumed and handled in many other ways.
.
This package contains de library itself.
Package: libhtmlcleaner-java-doc
Architecture: all
Section: doc
Depends: ${misc:Depends}, ${java:Depends}
Description: Java HTML Parser library (documentation)
HtmlCleaner can be used in java code, as command line tool or as Ant task.
It is designed to be small, independent (no runtime dependencies except
JRE 1.5+), fast and flexible (its behavior is configurable through number of
parameters). Although the main motive was to prepare ordinary HTML for XML
processing with XPath, XQuery and XSLT, structured data produced by
HtmlCleaner may be consumed and handled in many other ways.
.
This package contains the documentation for the library.
debian/htmlcleaner.jar
Document: libhtmlcleaner-java-doc
Title: HtmlCleaner API Documentation
Abstract: Javadocs containing the HtmlCleaner API documentation.
Section: Programming/Java
Format: HTML
Index: /usr/share/doc/libhtmlcleaner-java/api/index.html
Files: /usr/share/doc/libhtmlcleaner-java/api/*.html
target/apidocs/* usr/share/doc/libhtmlcleaner-java/api
# List of POM files for the package
# Format of this file is:
# <path to pom file> [option]*
# where option can be:
# --ignore: ignore this POM and its artifact if any
# --ignore-pom: don't install the POM. To use on POM files that are created
# temporarily for certain artifacts such as Javadoc jars. [mh_install, mh_installpoms]
# --no-parent: remove the <parent> tag from the POM
# --package=<package>: an alternative package to use when installing this POM
# and its artifact
# --has-package-version: to indicate that the original version of the POM is the same as the upstream part
# of the version for the package.
# --keep-elements=<elem1,elem2>: a list of XML elements to keep in the POM
# during a clean operation with mh_cleanpom or mh_installpom
# --artifact=<path>: path to the build artifact associated with this POM,
# it will be installed when using the command mh_install. [mh_install]
# --java-lib: install the jar into /usr/share/java to comply with Debian
# packaging guidelines
# --usj-name=<name>: name to use when installing the library in /usr/share/java
# --usj-version=<version>: version to use when installing the library in /usr/share/java
# --no-usj-versionless: don't install the versionless link in /usr/share/java
# --dest-jar=<path>: the destination for the real jar.
# It will be installed with mh_install. [mh_install]
# --classifier=<classifier>: Optional, the classifier for the jar. Empty by default.
# --site-xml=<location>: Optional, the location for site.xml if it needs to be installed.
# Empty by default. [mh_install]
#
pom.xml --no-parent --has-package-version
org.apache.maven.plugins maven-deploy-plugin * * * *
org.apache.maven.plugins maven-gpg-plugin * * * *
org.apache.maven.plugins maven-source-plugin * * * *
org.apache.maven.wagon wagon-ftp * * * *
org.apache.maven.wagon wagon-ssh-external * * * *
junit junit jar s/4\..*/4.x/ * *
net.sourceforge.htmlcleaner htmlcleaner bundle * * *
s/ant/org.apache.ant/ * * s/.*/debian/ * *
Description: Fix build in Debian
Fix build in Debian where jdom.jar is named jdom1.jar.
Author: Alexandre Rossi <alexandre.rossi@gmail.com>
--- libhtmlcleaner-java-2.2.orig/build.xml
+++ libhtmlcleaner-java-2.2/build.xml
@@ -13,7 +13,7 @@
<!-- classpath -->
<path id="classpath.compile">
<pathelement location="${dir.lib}/ant.jar"/>
- <pathelement location="${dir.lib}/jdom.jar"/>
+ <pathelement location="${dir.lib}/jdom1.jar"/>
</path>
<!-- sources targets -->
@@ -78,4 +78,4 @@
</zip>
</target>
-</project>
\ No newline at end of file
+</project>
#!/usr/bin/make -f
# -*- makefile -*-
#
# Uncomment this to turn on verbose mode.
export DH_VERBOSE=1
# This has to be exported to make some magic below work.
export DH_OPTIONS
export ANT_ARGS=-Ddir.lib=/usr/share/java -Djarfile=debian/htmlcleaner.jar
%:
dh $@ --with javahelper
override_dh_auto_install:
ant jar
dh $@ --buildsystem=maven --with javahelper
This diff is collapsed.
Copyright (c) 2006-2013, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
Copyright (c) 2006-2015, the HTMLCleaner project
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
\ No newline at end of file
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
Additional work by Amplafi. -- All rights released.
*/
package org.htmlcleaner;
public interface AttributeTransformation {
boolean satisfy(String attName, String attValue);
String getTemplate();
}
\ No newline at end of file
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
Additional work by Amplafi. -- All rights released.
*/
package org.htmlcleaner;
import java.util.regex.Pattern;
public class AttributeTransformationPatternImpl implements AttributeTransformation {
private final Pattern attNamePattern;
private final Pattern attValuePattern;
private final String template;
public AttributeTransformationPatternImpl(Pattern attNamePattern, Pattern attValuePattern, String template) {
this.attNamePattern = attNamePattern;
this.attValuePattern = attValuePattern;
this.template = template;
}
public AttributeTransformationPatternImpl(String attNamePattern, String attValuePattern, String template) {
this.attNamePattern = attNamePattern ==null?null:Pattern.compile(attNamePattern);
this.attValuePattern = attValuePattern == null? null: Pattern.compile(attValuePattern);
this.template = template;
}
public boolean satisfy(String attName, String attValue) {
if ( (attNamePattern == null || attNamePattern.matcher(attName).find()) && (attValuePattern ==null || attValuePattern.matcher(attValue).find())){
return true;
} else {
return false;
}
}
/**
* @return the template
*/
public String getTemplate() {
return template;
}
}
\ No newline at end of file
package org.htmlcleaner;
/**
* Base class for all tokens. Allows position tracking.
*
* @author Konstantin Burov (aectann@gmail.com)
*
*/
public abstract class BaseTokenImpl implements BaseToken {
private int row;
private int col;
protected BaseTokenImpl(){
}
protected BaseTokenImpl(int row, int col) {
this.row = row;
this.col = col;
}
public int getRow() {
return row;
}
public void setRow(int row) {
this.row = row;
}
public int getCol() {
return col;
}
public void setCol(int col) {
this.col = col;
}
@Override
public String toString() {
return "(line="+getRow()+", col="+getCol()+")";
}
}
/*
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
/**
* @author patmoore
*
*/
public enum BelongsTo {
HEAD_AND_BODY("all"),
HEAD("head"),
BODY("body");
private final String dbCode;
private BelongsTo(String dbCode) {
this.dbCode =dbCode;
}
/**
* @return the dbCode
*/
public String getDbCode() {
return dbCode;
}
public static BelongsTo toValue(Object value) {
BelongsTo result = null;
if ( value instanceof BelongsTo) {
result = (BelongsTo) value;
} else if ( value != null ) {
String dbCode = value.toString().trim();
for(BelongsTo belongsTo: BelongsTo.values()) {
if ( belongsTo.getDbCode().equalsIgnoreCase(dbCode) || belongsTo.name().equalsIgnoreCase(dbCode)) {
result = belongsTo;
break;
}
}
}
return result;
}
}
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
import java.util.StringTokenizer;
/**
* <p>
* Browser compact XML serializer - creates resulting XML by stripping whitespaces wherever possible,
* but preserving single whitespace where at least one exists. This behaviour is well suited
* for web-browsers, which usually treat multiple whitespaces as single one, but make difference
* between single whitespace and empty text.
* </p>
*/
public class BrowserCompactXmlSerializer extends XmlSerializer {
private static final String PRE_TAG = "pre";
private static final String BR_TAG = "<br />";
private static final String LINE_BREAK = "\n";
public BrowserCompactXmlSerializer(CleanerProperties props) {
super(props);
}
@Override
protected void serialize(TagNode tagNode, Writer writer) throws IOException {
serializeOpenTag(tagNode, writer, false);
TagInfo tagInfo = props.getTagInfoProvider().getTagInfo(tagNode.getName());
String tagName = tagInfo!=null? tagInfo.getName() : null;
List tagChildren = new ArrayList (tagNode.getAllChildren());
if (!isMinimizedTagSyntax(tagNode)) {
ListIterator childrenIt = tagChildren.listIterator();
while (childrenIt.hasNext()) {
Object item = childrenIt.next();
if (item != null) {
if (item instanceof ContentNode && !PRE_TAG.equals(tagName)) {
String content = ((ContentNode) item).getContent();
content = dontEscape(tagNode) ? content.replaceAll("]]>", "]]&gt;") : escapeXml(content);
content = content.replaceAll("^"+SpecialEntities.NON_BREAKABLE_SPACE+"+", " ");
content = content.replaceAll(SpecialEntities.NON_BREAKABLE_SPACE+"+$", " ");
boolean whitespaceAllowed = tagInfo != null && tagInfo.getDisplay().isLeadingAndEndWhitespacesAllowed();
boolean writeLeadingSpace = content.length() > 0 && (Character.isWhitespace(content.charAt(0))) && whitespaceAllowed;
boolean writeEndingSpace = content.length() > 1 && Character.isWhitespace(content.charAt(content.length() - 1)) && whitespaceAllowed;
content = content.trim();
if (content.length() != 0) {
boolean hasPrevContent = false;
int order = tagChildren.indexOf(item);
if (order >= 2 && childrenIt.hasNext()) {
Object prev = tagChildren.get(order-1);
hasPrevContent = isContentOrInline(prev);
}
if (writeLeadingSpace || hasPrevContent) {
writer.write(' ');
}
StringTokenizer tokenizer = new StringTokenizer(content, LINE_BREAK, true);
String prevToken = "";
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
if (prevToken.equals(token) && prevToken.equals(LINE_BREAK)) {
writer.write(BR_TAG);
prevToken = "";
} else if (LINE_BREAK.equals(token)) {
writer.write(' ');
} else {
writer.write(token.trim());
}
prevToken = token;
}
boolean hasFollowingContent = false;
if (childrenIt.hasNext()) {
Object next = childrenIt.next();
hasFollowingContent = isContentOrInline(next);
childrenIt.previous();
}
if (writeEndingSpace || hasFollowingContent) {
writer.write(' ');
}
} else{
childrenIt.remove();
}
} else if(item instanceof ContentNode){
String content = ((ContentNode) item).getContent();
writer.write(content);
} else if (item instanceof CommentNode) {
String content = ((CommentNode) item).getCommentedContent().trim();
writer.write(content);
} else {
((BaseToken)item).serialize(this, writer);
}
}
}
serializeEndTag(tagNode, writer, tagInfo != null && tagInfo.getDisplay().isAfterTagLineBreakNeeded());
}
}
private boolean isContentOrInline(Object node) {
boolean result = false;
if (node instanceof ContentNode) {
result = true;
} else if (node instanceof TagNode) {
TagInfo nextInfo = props.getTagInfoProvider().getTagInfo(((TagNode) node).getName());
result = nextInfo != null && nextInfo.getDisplay() == Display.inline;
}
return result;
}
}
\ No newline at end of file
This diff is collapsed.
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
/**
* Contains transformation collection.
*/
public class CleanerTransformations {
private Map mappings = new HashMap();
private TagTransformation globalTransformations=new TagTransformation();
public CleanerTransformations() {
}
/**
* @param transInfos
*/
public CleanerTransformations(Map transInfos) {
updateTagTransformations(transInfos);
}
/**
* Adds specified tag transformation to the collection.
* @param tagTransformation
*/
public void addTransformation(TagTransformation tagTransformation) {
if (tagTransformation != null) {
mappings.put( tagTransformation.getSourceTag(), tagTransformation );
}
}
public void addGlobalTransformation(AttributeTransformation attributeTransformation) {
globalTransformations.addAttributePatternTransformation(attributeTransformation);
}
public boolean hasTransformationForTag(String tagName) {
return tagName != null && mappings.containsKey(tagName.toLowerCase());
}
public TagTransformation getTransformation(String tagName) {
return tagName != null ? (TagTransformation) mappings.get(tagName.toLowerCase()) : null;
}
public void updateTagTransformations(String key, String value) {
int index = key.indexOf('.');
// new tag transformation case (tagname[=destname[,preserveatts]])
if (index <= 0) {
String destTag = null;
boolean preserveSourceAtts = true;
if (value != null) {
String[] tokens = Utils.tokenize(value, ",;");
if (tokens.length > 0) {
destTag = tokens[0];
}
if (tokens.length > 1) {
preserveSourceAtts = "true".equalsIgnoreCase(tokens[1]) ||
"yes".equalsIgnoreCase(tokens[1]) ||
"1".equals(tokens[1]);
}
}
TagTransformation newTagTrans = new TagTransformation(key, destTag, preserveSourceAtts);
addTransformation(newTagTrans);
} else { // attribute transformation description
String[] parts = Utils.tokenize(key, ".");
String tagName = parts[0];
TagTransformation trans = getTransformation(tagName);
if (trans != null) {
trans.addAttributeTransformation(parts[1], value);
}
}
}
public void updateTagTransformations(Map transInfos) {
Iterator iterator = transInfos.entrySet().iterator();
while (iterator.hasNext()) {
Map.Entry entry = (Map.Entry) iterator.next();
String tag = (String) entry.getKey();
String value = (String) entry.getValue();
updateTagTransformations(tag, value);
}
}
public Map<String, String> transformAttributes(String originalTagName, Map<String, String> attributes) {
TagTransformation tagTrans = getTransformation(originalTagName);
Map<String, String> results;
if ( tagTrans != null ) {
results = tagTrans.applyTagTransformations(attributes);
} else {
results = attributes;
}
return this.globalTransformations.applyTagTransformations(results);
}
public String getTagName(String tagName) {
TagTransformation tagTransformation = null;
if (hasTransformationForTag(tagName)) {
tagTransformation = getTransformation(tagName);
if (tagTransformation != null) {
return tagTransformation.getDestTag();
}
}
return tagName;
}
/**
*
*/
public void clear() {
this.mappings.clear();
}
}
\ No newline at end of file
This diff is collapsed.
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.IOException;
import java.io.Writer;
/**
* <p>HTML comment token.</p>
*/
public class CommentNode extends BaseTokenImpl implements HtmlNode {
private String content;
public CommentNode(String content) {
this.content = content;
}
public String getCommentedContent() {
return "<!--" + content + "-->";
}
public String getContent() {
return content;
}
@Override
public String toString() {
return getCommentedContent();
}
public void serialize(Serializer serializer, Writer writer) throws IOException {
writer.write( getCommentedContent() );
}
}
\ No newline at end of file
This diff is collapsed.
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.IOException;
import java.io.Writer;
/**
* <p>HTML text token.</p>
*/
public class ContentNode extends BaseTokenImpl implements HtmlNode {
private final String content;
private final boolean blank;
public ContentNode(String content) {
this.content = content;
this.blank = Utils.isEmptyString(this.content);
}
public String getContent() {
return content;
}
@Override
public String toString() {
return getContent();
}
public void serialize(Serializer serializer, Writer writer) throws IOException {
writer.write( content );
}
public boolean isBlank() {
return this.blank;
}
}
\ No newline at end of file
/*
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
/**
* @author patmoore
*
*/
public enum ContentType {
all("all"),
/**
* elements that have no children or content ( for example <img> ). For these elements, the check for null elements must be more than must a children/ content check.
*/
none("none"),
text("text");
private final String dbCode;
private ContentType(String dbCode) {
this.dbCode =dbCode;
}
/**
* @return the dbCode
*/
public String getDbCode() {
return dbCode;
}
public static ContentType toValue(Object value) {
ContentType result = null;
if ( value instanceof ContentType) {
result = (ContentType) value;
} else if ( value != null ) {
String dbCode = value.toString().trim();
for(ContentType contentType: ContentType.values()) {
if ( contentType.getDbCode().equalsIgnoreCase(dbCode) || contentType.name().equalsIgnoreCase(dbCode)) {
result = contentType;
break;
}
}
}
return result;
}
}
package org.htmlcleaner;
/**
* Most HTML 4 elements permitted within the BODY are classified as either
* block-level elements or inline elements. This enumeration contains
* corresponding constants to distinguish them.
*
* @author Konstantin Burov (aectann@gmail.com)
*
*/
public enum Display {
/**
* Block-level elements typically contain inline elements and other
* block-level elements. When rendered visually, block-level elements
* usually begin on a new line.
*/
block(true, false),
/**
* Inline elements typically may only contain text and other inline
* elements. When rendered visually, inline elements do not usually begin on
* a new line.
*/
inline(false, true),
/**
* The following elements may be used as either block-level elements or
* inline elements. If used as inline elements (e.g., within another inline
* element or a P), these elements should not contain any block-level
* elements.
*/
any(true, false),
/**
* Elements that are not actually inline or block, usually such elements are
* not rendered at all.
*/
none(true, false);
private boolean afterTagLineBreakNeeded;
private boolean leadingAndEndWhitespacesAllowed;
private Display(boolean afterTagLineBreakNeeded, boolean leadingAndEndWhitespacesAllowed) {
this.afterTagLineBreakNeeded = afterTagLineBreakNeeded;
this.leadingAndEndWhitespacesAllowed = leadingAndEndWhitespacesAllowed;
}
/**
* @return true to advise serializers to put line break after tags with such a display type.
*/
public boolean isAfterTagLineBreakNeeded() {
return afterTagLineBreakNeeded;
}
/**
* @return true if tag contents can have single leading or end whitespace
*/
public boolean isLeadingAndEndWhitespacesAllowed() {
return leadingAndEndWhitespacesAllowed;
}
}
This diff is collapsed.
package org.htmlcleaner;
import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
/**
* <p>DOM serializer - creates xml DOM.</p>
*/
public class DomSerializer {
protected CleanerProperties props;
protected boolean escapeXml = true;
public DomSerializer(CleanerProperties props, boolean escapeXml) {
this.props = props;
this.escapeXml = escapeXml;
}
public DomSerializer(CleanerProperties props) {
this(props, true);
}
public Document createDOM(TagNode rootNode) throws ParserConfigurationException {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
Document document = factory.newDocumentBuilder().newDocument();
Element rootElement = document.createElement(rootNode.getName());
document.appendChild(rootElement);
createSubnodes(document, rootElement, rootNode.getAllChildren());
return document;
}
protected boolean isScriptOrStyle(Element element) {
String tagName = element.getNodeName();
return "script".equalsIgnoreCase(tagName) || "style".equalsIgnoreCase(tagName);
}
/**
* encapsulate content with <[CDATA[ ]]> for things like script and style elements
* @param element
* @return true if <[CDATA[ ]]> should be used.
*/
protected boolean dontEscape(Element element) {
// make sure <script src=..></script> doesn't get turned into <script src=..><[CDATA[]]></script>
// TODO check for blank content as well.
return props.isUseCdataForScriptAndStyle() && isScriptOrStyle(element) && !element.hasChildNodes();
}
private void createSubnodes(Document document, Element element, List tagChildren) {
if (tagChildren != null) {
for(Object item : tagChildren) {
if (item instanceof CommentNode) {
CommentNode commentNode = (CommentNode) item;
Comment comment = document.createComment( commentNode.getContent() );
element.appendChild(comment);
} else if (item instanceof ContentNode) {
ContentNode contentNode = (ContentNode) item;
String content = contentNode.getContent();
boolean specialCase = dontEscape(element);
if (escapeXml && !specialCase) {
content = Utils.escapeXml(content, props, true);
}
element.appendChild( specialCase ? document.createCDATASection(content) : document.createTextNode(content) );
} else if (item instanceof TagNode) {
TagNode subTagNode = (TagNode) item;
Element subelement = document.createElement( subTagNode.getName() );
Map attributes = subTagNode.getAttributes();
Iterator entryIterator = attributes.entrySet().iterator();
while (entryIterator.hasNext()) {
Map.Entry entry = (Map.Entry) entryIterator.next();
String attrName = (String) entry.getKey();
String attrValue = (String) entry.getValue();
if (escapeXml) {
attrValue = Utils.escapeXml(attrValue, props, true);
}
subelement.setAttribute(attrName, attrValue);
}
// recursively create subnodes
createSubnodes(document, subelement, subTagNode.getAllChildren());
element.appendChild(subelement);
} else if (item instanceof List) {
List sublist = (List) item;
createSubnodes(document, element, sublist);
}
}
}
}
}
\ No newline at end of file
This diff is collapsed.
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
/**
* <p>General HtmlCleaner runtime exception.</p>
*/
public class HtmlCleanerException extends RuntimeException {
public HtmlCleanerException() {
this("HtmlCleaner expression occureed!");
}
public HtmlCleanerException(Throwable cause) {
super(cause);
}
public HtmlCleanerException(String message) {
super(message);
}
public HtmlCleanerException(String message, Throwable cause) {
super(message, cause);
}
}
\ No newline at end of file
This diff is collapsed.
package org.htmlcleaner;
/**
* Marker interface denoting nodes of the document tree
*/
public interface HtmlNode {
}
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
/**
* <p>
* Provides set of TagInfo instances. The instance of this interface is used as a
* collection of tag definitions used in cleanup process. Implementing this interface
* desired behaviour of cleaner can be achived.<br/>
* In most cases implementation will be or contain a kind of Map.
* </p>
*/
public interface ITagInfoProvider {
public TagInfo getTagInfo(String tagName);
}
package org.htmlcleaner;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.jdom2.Comment;
import org.jdom2.DefaultJDOMFactory;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.Namespace;
import org.jdom2.Text;
/**
* <p>
* JDom serializer - creates xml JDom instance out of the TagNode.
* </p>
*/
public class JDomSerializer {
private DefaultJDOMFactory factory;
protected CleanerProperties props;
protected boolean escapeXml = true;
public JDomSerializer(CleanerProperties props, boolean escapeXml) {
this.props = props;
this.escapeXml = escapeXml;
}
public JDomSerializer(CleanerProperties props) {
this(props, true);
}
public Document createJDom(TagNode rootNode) {
this.factory = new DefaultJDOMFactory();
Element rootElement = createElement(rootNode);
Document document = this.factory.document(rootElement);
setAttributes(rootNode, rootElement);
createSubnodes(rootElement, rootNode.getAllChildren());
return document;
}
private Element createElement(TagNode node) {
String name = node.getName();
boolean nsAware = props.isNamespacesAware();
String prefix = Utils.getXmlNSPrefix(name);
Map<String, String> nsDeclarations = node.getNamespaceDeclarations();
String nsURI = null;
if (prefix != null) {
name = Utils.getXmlName(name);
if