Commit 0e4e5374 authored by Alexandre Rossi's avatar Alexandre Rossi

New upstream version 2.21

parent a193977a
Manifest-Version: 1.0
Ant-Version: Apache Ant 1.8.4
Main-Class: org.htmlcleaner.CommandLine
Manifest-Version: 1.0
Ant-Version: Apache Ant 1.8.4
Main-Class: org.htmlcleaner.CommandLine
\ No newline at end of file
This diff is collapsed.
Copyright (c) 2006-2013, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
Copyright (c) 2006-2015, the HTMLCleaner project
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
\ No newline at end of file
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
Additional work by Amplafi. -- All rights released.
*/
package org.htmlcleaner;
/**
 * Contract for a single attribute transformation: a test over an attribute's
 * name and value, paired with a template string.
 */
public interface AttributeTransformation {

    /**
     * @return the template string associated with this transformation
     */
    String getTemplate();

    /**
     * Tells whether this transformation applies to the given attribute.
     *
     * @param attName  name of the attribute under test
     * @param attValue value of the attribute under test
     * @return {@code true} if the attribute meets this transformation's conditions
     */
    boolean satisfy(String attName, String attValue);
}
\ No newline at end of file
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
Additional work by Amplafi. -- All rights released.
*/
package org.htmlcleaner;
import java.util.regex.Pattern;
/**
 * {@link AttributeTransformation} that decides applicability with two optional
 * regular expressions, one for the attribute name and one for the attribute value.
 * A {@code null} pattern places no restriction on the corresponding part.
 */
public class AttributeTransformationPatternImpl implements AttributeTransformation {
    private final Pattern attNamePattern;
    private final Pattern attValuePattern;
    private final String template;

    /**
     * Creates a transformation from pre-compiled patterns.
     *
     * @param attNamePattern  pattern the attribute name must contain a match for, or {@code null} for any name
     * @param attValuePattern pattern the attribute value must contain a match for, or {@code null} for any value
     * @param template        template string returned by {@link #getTemplate()}
     */
    public AttributeTransformationPatternImpl(Pattern attNamePattern, Pattern attValuePattern, String template) {
        this.attNamePattern = attNamePattern;
        this.attValuePattern = attValuePattern;
        this.template = template;
    }

    /**
     * Creates a transformation from regular expression sources, compiling each non-null one.
     *
     * @param attNamePattern  regex for the attribute name, or {@code null} for any name
     * @param attValuePattern regex for the attribute value, or {@code null} for any value
     * @param template        template string returned by {@link #getTemplate()}
     * @throws java.util.regex.PatternSyntaxException if a non-null expression is invalid
     */
    public AttributeTransformationPatternImpl(String attNamePattern, String attValuePattern, String template) {
        this(compileOrNull(attNamePattern), compileOrNull(attValuePattern), template);
    }

    /**
     * Checks both patterns against the attribute using {@code find()} semantics
     * (a match anywhere in the input is sufficient).
     *
     * @param attName  attribute name; may be {@code null}
     * @param attValue attribute value; may be {@code null}
     * @return {@code true} if every configured pattern finds a match; a {@code null}
     *         input never matches a configured pattern
     */
    public boolean satisfy(String attName, String attValue) {
        return accepts(attNamePattern, attName) && accepts(attValuePattern, attValue);
    }

    /**
     * @return the template
     */
    public String getTemplate() {
        return template;
    }

    /** Compiles the expression, passing {@code null} through unchanged. */
    private static Pattern compileOrNull(String regex) {
        return regex == null ? null : Pattern.compile(regex);
    }

    /**
     * An absent pattern accepts everything; a present pattern needs a non-null
     * input, because {@code Pattern.matcher(null)} would otherwise throw.
     */
    private static boolean accepts(Pattern pattern, String input) {
        if (pattern == null) {
            return true;
        }
        return input != null && pattern.matcher(input).find();
    }
}
\ No newline at end of file
package org.htmlcleaner;
/**
 * Common superclass for tokens. Holds a row and a column so that a token's
 * position can be tracked.
 *
 * @author Konstantin Burov (aectann@gmail.com)
 */
public abstract class BaseTokenImpl implements BaseToken {

    private int row;
    private int col;

    /**
     * Creates a token positioned at row 0, column 0.
     */
    protected BaseTokenImpl() {
        this(0, 0);
    }

    /**
     * Creates a token at the given position.
     *
     * @param row row to record
     * @param col column to record
     */
    protected BaseTokenImpl(int row, int col) {
        this.row = row;
        this.col = col;
    }

    /** @return the recorded column */
    public int getCol() {
        return col;
    }

    /** @param col column to record */
    public void setCol(int col) {
        this.col = col;
    }

    /** @return the recorded row */
    public int getRow() {
        return row;
    }

    /** @param row row to record */
    public void setRow(int row) {
        this.row = row;
    }

    /**
     * @return the position formatted as {@code (line=ROW, col=COL)}, read through the getters
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("(line=").append(getRow());
        text.append(", col=").append(getCol());
        text.append(")");
        return text.toString();
    }
}
/*
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
/**
 * Three-valued enumeration (head, body, or both), each constant carrying a short
 * string code. Presumably describes which document section a tag belongs to;
 * confirm against the callers.
 *
 * @author patmoore
 */
public enum BelongsTo {
    HEAD_AND_BODY("all"),
    HEAD("head"),
    BODY("body");

    private final String dbCode;

    BelongsTo(String dbCode) {
        this.dbCode = dbCode;
    }

    /**
     * @return the dbCode
     */
    public String getDbCode() {
        return dbCode;
    }

    /**
     * Converts an arbitrary object into a constant of this enum.
     * <p>
     * An instance of this enum is returned as is. Any other non-null object is
     * converted with {@code toString()}, trimmed, and compared case-insensitively
     * against each constant's code and name, in declaration order.
     *
     * @param value object to convert; may be {@code null}
     * @return the matching constant, or {@code null} if {@code value} is
     *         {@code null} or matches nothing
     */
    public static BelongsTo toValue(Object value) {
        if (value instanceof BelongsTo) {
            return (BelongsTo) value;
        }
        if (value == null) {
            return null;
        }
        final String wanted = value.toString().trim();
        for (BelongsTo candidate : values()) {
            boolean codeMatches = candidate.getDbCode().equalsIgnoreCase(wanted);
            if (codeMatches || candidate.name().equalsIgnoreCase(wanted)) {
                return candidate;
            }
        }
        return null;
    }
}
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
import java.util.StringTokenizer;
/**
* <p>
* Browser compact XML serializer - creates resulting XML by stripping whitespaces wherever possible,
* but preserving single whitespace where at least one exists. This behaviour is well suited
* for web-browsers, which usually treat multiple whitespaces as single one, but make difference
* between single whitespace and empty text.
* </p>
*/
public class BrowserCompactXmlSerializer extends XmlSerializer {
// Tag name whose text children are written verbatim, without whitespace compaction.
private static final String PRE_TAG = "pre";
// Markup written when a line-break token immediately follows another line-break token.
private static final String BR_TAG = "<br />";
// Delimiter used to split text content into tokens.
private static final String LINE_BREAK = "\n";
/**
 * Creates a serializer configured with the given properties.
 *
 * @param props cleaner properties, passed to the superclass constructor
 */
public BrowserCompactXmlSerializer(CleanerProperties props) {
super(props);
}
/**
 * Writes the given tag node and its children to the writer in compact form.
 * <p>
 * The open tag is always written. Unless the tag uses minimized syntax, each
 * non-null child is then written according to its type, followed by the end tag:
 * <ul>
 * <li>text outside a {@code pre} tag is escaped, trimmed, and written with at
 * most one space on either side; line breaks inside it become spaces or
 * {@code <br />};</li>
 * <li>text inside a {@code pre} tag is written unchanged;</li>
 * <li>comments are written as their trimmed commented content;</li>
 * <li>any other child serializes itself through {@code BaseToken.serialize}.</li>
 * </ul>
 *
 * @param tagNode node to serialize
 * @param writer destination for the output
 * @throws IOException if writing fails
 */
@Override
protected void serialize(TagNode tagNode, Writer writer) throws IOException {
serializeOpenTag(tagNode, writer, false);
TagInfo tagInfo = props.getTagInfoProvider().getTagInfo(tagNode.getName());
// tagName stays null when the provider has no info for this tag, so the "pre" check below fails.
String tagName = tagInfo!=null? tagInfo.getName() : null;
// Work on a copy of the children so that removing whitespace-only text below only edits this local list.
List tagChildren = new ArrayList (tagNode.getAllChildren());
if (!isMinimizedTagSyntax(tagNode)) {
ListIterator childrenIt = tagChildren.listIterator();
while (childrenIt.hasNext()) {
Object item = childrenIt.next();
if (item != null) {
if (item instanceof ContentNode && !PRE_TAG.equals(tagName)) {
String content = ((ContentNode) item).getContent();
// When escaping is disabled only "]]>" is rewritten; otherwise the content is XML-escaped.
content = dontEscape(tagNode) ? content.replaceAll("]]>", "]]&gt;") : escapeXml(content);
// Collapse a leading and a trailing run of SpecialEntities.NON_BREAKABLE_SPACE into one plain space each.
content = content.replaceAll("^"+SpecialEntities.NON_BREAKABLE_SPACE+"+", " ");
content = content.replaceAll(SpecialEntities.NON_BREAKABLE_SPACE+"+$", " ");
boolean whitespaceAllowed = tagInfo != null && tagInfo.getDisplay().isLeadingAndEndWhitespacesAllowed();
// Record edge whitespace before trimming; the ending check needs length > 1, so a
// single whitespace character counts as leading only.
boolean writeLeadingSpace = content.length() > 0 && (Character.isWhitespace(content.charAt(0))) && whitespaceAllowed;
boolean writeEndingSpace = content.length() > 1 && Character.isWhitespace(content.charAt(content.length() - 1)) && whitespaceAllowed;
content = content.trim();
if (content.length() != 0) {
boolean hasPrevContent = false;
// indexOf returns the position of the first element equal to item.
int order = tagChildren.indexOf(item);
// The previous sibling is only inspected when item is at index 2 or later and is not the last child.
if (order >= 2 && childrenIt.hasNext()) {
Object prev = tagChildren.get(order-1);
hasPrevContent = isContentOrInline(prev);
}
if (writeLeadingSpace || hasPrevContent) {
writer.write(' ');
}
// Delimiters are returned as tokens, so each "\n" is seen individually.
StringTokenizer tokenizer = new StringTokenizer(content, LINE_BREAK, true);
String prevToken = "";
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
if (prevToken.equals(token) && prevToken.equals(LINE_BREAK)) {
// A line break directly after another line break is written as <br />.
writer.write(BR_TAG);
// NOTE(review): this reset is overwritten by the unconditional assignment at the end of the loop body.
prevToken = "";
} else if (LINE_BREAK.equals(token)) {
// A line break that does not follow another one becomes a single space.
writer.write(' ');
} else {
writer.write(token.trim());
}
prevToken = token;
}
boolean hasFollowingContent = false;
if (childrenIt.hasNext()) {
// Peek at the next child, then step back so the main loop still visits it.
Object next = childrenIt.next();
hasFollowingContent = isContentOrInline(next);
childrenIt.previous();
}
if (writeEndingSpace || hasFollowingContent) {
writer.write(' ');
}
} else{
// Text that is empty after trimming is dropped from the local copy of the children.
childrenIt.remove();
}
} else if(item instanceof ContentNode){
// Text directly inside a "pre" tag: written as is.
String content = ((ContentNode) item).getContent();
writer.write(content);
} else if (item instanceof CommentNode) {
String content = ((CommentNode) item).getCommentedContent().trim();
writer.write(content);
} else {
((BaseToken)item).serialize(this, writer);
}
}
}
// The end tag is written only on this non-minimized path.
serializeEndTag(tagNode, writer, tagInfo != null && tagInfo.getDisplay().isAfterTagLineBreakNeeded());
}
}
/**
 * Tells whether the given child is text or a tag whose display type is inline.
 *
 * @param node child to inspect; anything other than a ContentNode or TagNode yields {@code false}
 * @return {@code true} for a ContentNode, or for a TagNode whose tag info exists
 * and reports {@code Display.inline}
 */
private boolean isContentOrInline(Object node) {
boolean result = false;
if (node instanceof ContentNode) {
result = true;
} else if (node instanceof TagNode) {
TagInfo nextInfo = props.getTagInfoProvider().getTagInfo(((TagNode) node).getName());
result = nextInfo != null && nextInfo.getDisplay() == Display.inline;
}
return result;
}
}
\ No newline at end of file
This diff is collapsed.
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
/**
* Contains transformation collection.
*/
public class CleanerTransformations {
// Tag transformations keyed by the value of their getSourceTag() (see addTransformation).
private Map mappings = new HashMap();
// Holds the attribute transformations added through addGlobalTransformation;
// transformAttributes applies it to every tag.
private TagTransformation globalTransformations=new TagTransformation();
/**
 * Creates an empty collection of transformations.
 */
public CleanerTransformations() {
}
/**
 * Creates a collection populated by passing the given map to
 * {@link #updateTagTransformations(Map)}.
 *
 * @param transInfos map of transformation descriptions; keys and values are
 * cast to {@code String} when processed
 */
public CleanerTransformations(Map transInfos) {
updateTagTransformations(transInfos);
}
/**
 * Registers a tag transformation under its source tag, replacing any
 * transformation already stored under the same key.
 *
 * @param tagTransformation transformation to register; {@code null} is ignored
 */
public void addTransformation(TagTransformation tagTransformation) {
    if (tagTransformation == null) {
        return;
    }
    mappings.put(tagTransformation.getSourceTag(), tagTransformation);
}
/**
 * Adds an attribute transformation to the global set, which
 * {@code transformAttributes} applies regardless of tag name.
 *
 * @param attributeTransformation transformation to add
 */
public void addGlobalTransformation(AttributeTransformation attributeTransformation) {
    this.globalTransformations.addAttributePatternTransformation(attributeTransformation);
}
/**
 * Tells whether a transformation is registered for the given tag.
 *
 * @param tagName tag name to look up; lower-cased before the lookup
 * @return {@code true} if the lower-cased name is a registered key;
 *         {@code false} for {@code null}
 */
public boolean hasTransformationForTag(String tagName) {
    if (tagName == null) {
        return false;
    }
    String lookupKey = tagName.toLowerCase();
    return mappings.containsKey(lookupKey);
}
/**
 * Looks up the transformation registered for the given tag.
 *
 * @param tagName tag name to look up; lower-cased before the lookup
 * @return the registered transformation, or {@code null} if there is none or
 *         {@code tagName} is {@code null}
 */
public TagTransformation getTransformation(String tagName) {
    if (tagName == null) {
        return null;
    }
    Object registered = mappings.get(tagName.toLowerCase());
    return (TagTransformation) registered;
}
/**
 * Registers one transformation described by a key/value pair.
 * <p>
 * If the key has no dot after its first character, it describes a tag
 * transformation: the key is the source tag and the value has the form
 * {@code destname[,preserveatts]}, split on {@code ,} or {@code ;}. Source
 * attributes are preserved unless a second token is present and is not
 * {@code true}/{@code yes} (any case) or {@code 1}.
 * <p>
 * Otherwise the key has the form {@code tagname.attname} and the value is added
 * as an attribute transformation to the transformation already registered for
 * {@code tagname}. The entry is ignored if no such transformation exists or if
 * the key yields no attribute name.
 *
 * @param key   transformation key; must not be {@code null}
 * @param value transformation description; may be {@code null}
 */
public void updateTagTransformations(String key, String value) {
    int index = key.indexOf('.');

    // new tag transformation case (tagname[=destname[,preserveatts]])
    if (index <= 0) {
        String destTag = null;
        boolean preserveSourceAtts = true;
        if (value != null) {
            String[] tokens = Utils.tokenize(value, ",;");
            if (tokens.length > 0) {
                destTag = tokens[0];
            }
            if (tokens.length > 1) {
                preserveSourceAtts = "true".equalsIgnoreCase(tokens[1]) ||
                                     "yes".equalsIgnoreCase(tokens[1]) ||
                                     "1".equals(tokens[1]);
            }
        }
        TagTransformation newTagTrans = new TagTransformation(key, destTag, preserveSourceAtts);
        addTransformation(newTagTrans);
        return;
    }

    // attribute transformation description
    String[] parts = Utils.tokenize(key, ".");
    if (parts.length < 2) {
        // A key such as "tag." may produce no attribute token (depends on
        // Utils.tokenize, not visible here); skip it rather than index past the array.
        return;
    }
    TagTransformation trans = getTransformation(parts[0]);
    if (trans != null) {
        trans.addAttributeTransformation(parts[1], value);
    }
}
/**
 * Registers every entry of the given map through
 * {@link #updateTagTransformations(String, String)}.
 *
 * @param transInfos map of transformation descriptions; each key and value is
 *                   cast to {@code String}
 */
public void updateTagTransformations(Map transInfos) {
    for (Object element : transInfos.entrySet()) {
        Map.Entry entry = (Map.Entry) element;
        String tag = (String) entry.getKey();
        String value = (String) entry.getValue();
        updateTagTransformations(tag, value);
    }
}
/**
 * Applies the tag-specific transformation (if one is registered for the tag)
 * and then the global transformations to the given attributes.
 *
 * @param originalTagName tag name used to look up the tag-specific transformation
 * @param attributes      attributes to transform
 * @return the result of the global transformations applied to the (possibly
 *         tag-transformed) attributes
 */
public Map<String, String> transformAttributes(String originalTagName, Map<String, String> attributes) {
    Map<String, String> afterTagRules = attributes;
    TagTransformation tagTrans = getTransformation(originalTagName);
    if (tagTrans != null) {
        afterTagRules = tagTrans.applyTagTransformations(attributes);
    }
    return globalTransformations.applyTagTransformations(afterTagRules);
}
/**
 * Resolves the name a tag should be written under.
 *
 * @param tagName original tag name
 * @return the destination tag of the registered transformation, or
 *         {@code tagName} itself when no transformation is registered for it
 */
public String getTagName(String tagName) {
    if (!hasTransformationForTag(tagName)) {
        return tagName;
    }
    TagTransformation match = getTransformation(tagName);
    return match != null ? match.getDestTag() : tagName;
}
/**