Commit b2769bf2 authored by Barnaby Walters's avatar Barnaby Walters

Revised PR to comply with revised script/style element proposal

parent 1f34c9e8
......@@ -357,6 +357,12 @@ class Parser {
}
public function textContent(DOMElement $el) {
$excludeTags = array('noframe', 'noscript', 'script', 'style', 'frames', 'frameset');
if (isset($el->tagName) and in_array(strtolower($el->tagName), $excludeTags)) {
return '';
}
$this->resolveChildUrls($el);
$clonedEl = $el->cloneNode(true);
......@@ -365,8 +371,14 @@ class Parser {
$newNode = $this->doc->createTextNode($imgEl->getAttribute($imgEl->hasAttribute('alt') ? 'alt' : 'src'));
$imgEl->parentNode->replaceChild($newNode, $imgEl);
}
foreach ($excludeTags as $tagName) {
foreach ($this->xpath->query(".//{$tagName}", $clonedEl) as $elToRemove) {
$elToRemove->parentNode->removeChild($elToRemove);
}
}
return $clonedEl->textContent;
return $this->innerText($clonedEl);
}
/**
......@@ -387,27 +399,20 @@ class Parser {
$excludeTags = array('noframe', 'noscript', 'script', 'style', 'frames', 'frameset');
if (isset($el->tagName))
{
if (isset($el->tagName)) {
if (in_array(strtolower($el->tagName), $excludeTags)) {
return $out;
}
else if ($el->tagName == 'img') {
} else if ($el->tagName == 'img') {
if ($el->getAttribute('alt') !== '') {
return $el->getAttribute('alt');
}
else if (!$implied && $el->getAttribute('src') !== '') {
{
} else if (!$implied && $el->getAttribute('src') !== '') {
return $this->resolveUrl($el->getAttribute('src'));
}
}
} else if ($el->tagName == 'area' and $el->getAttribute('alt') !== '') {
return $el->getAttribute('alt');
} else if ($el->tagName == 'abbr' and $el->getAttribute('title') !== '') {
return $el->getAttribute('title');
}
}
// if node is a text node get its text
......@@ -417,32 +422,22 @@ class Parser {
// get the text of the child nodes
if ($el->childNodes && $el->childNodes->length > 0) {
for ($j = 0; $j < $el->childNodes->length; $j++) {
$text = $this->innerText($el->childNodes->item($j), $implied);
if ( !is_null($text) )
{
if (!is_null($text)) {
$out .= $text;
}
}
}
if (isset($el->tagName)) {
// if its a block level tag add an additional space at the end
if (in_array(strtolower($el->tagName), $blockLevelTags))
{
if (in_array(strtolower($el->tagName), $blockLevelTags)) {
$out .= ' ';
}
// else if its a br, replace with newline
else if (strtolower($el->tagName) == 'br')
{
} else if (strtolower($el->tagName) == 'br') {
// else if its a br, replace with newline
$out .= "\n";
}
}
return ($out === '') ? NULL : $out;
......@@ -452,11 +447,12 @@ class Parser {
public function resolveUrl($url) {
// If the URL is seriously malformed it’s probably beyond the scope of this
// parser to try to do anything with it.
if (parse_url($url) === false)
if (parse_url($url) === false) {
return $url;
}
// per issue #40 valid URLs could have a space on either side
$url = trim($url);
// per issue #40 valid URLs could have a space on either side
$url = trim($url);
$scheme = parse_url($url, PHP_URL_SCHEME);
......@@ -674,7 +670,7 @@ class Parser {
if (!empty($value))
$dtValue = $value;
else
$dtValue = $dt->nodeValue;
$dtValue = $this->textContent($dt);
} elseif ($dt->tagName == 'abbr') {
// Use @title, otherwise innertext
// Is it an entire dt?
......@@ -682,7 +678,7 @@ class Parser {
if (!empty($title))
$dtValue = $title;
else
$dtValue = $dt->nodeValue;
$dtValue = $this->textContent($dt);
} elseif ($dt->tagName == 'del' or $dt->tagName == 'ins' or $dt->tagName == 'time') {
// Use @datetime if available, otherwise innertext
// Is it an entire dt?
......@@ -690,9 +686,9 @@ class Parser {
if (!empty($dtAttr))
$dtValue = $dtAttr;
else
$dtValue = $dt->nodeValue;
$dtValue = $this->textContent($dt);
} else {
$dtValue = $dt->nodeValue;
$dtValue = $this->textContent($dt);
}
if (preg_match('/(\d{4}-\d{2}-\d{2})/', $dtValue, $matches)) {
......@@ -730,9 +726,6 @@ class Parser {
// TODO: as it is this is not relative to only children, make this .// and rerun tests
$this->resolveChildUrls($e);
$this->removeTags($e, 'script');
$this->removeTags($e, 'style');
$html = '';
foreach ($e->childNodes as $node) {
$html .= $node->C14N();
......@@ -1428,7 +1421,7 @@ function resolveUrl($baseURI, $referenceURI) {
# 5.2.3 Merge Paths
function mergePaths($base, $reference) {
# If the base URI has a defined authority component and an empty
# path,
# path,
if($base['authority'] && $base['path'] == null) {
# then return a string consisting of "/" concatenated with the
# reference's path; otherwise,
......@@ -1436,13 +1429,13 @@ function mergePaths($base, $reference) {
} else {
if(($pos=strrpos($base['path'], '/')) !== false) {
# return a string consisting of the reference's path component
# appended to all but the last segment of the base URI's path (i.e.,
# excluding any characters after the right-most "/" in the base URI
# path,
# appended to all but the last segment of the base URI's path (i.e.,
# excluding any characters after the right-most "/" in the base URI
# path,
$merged = substr($base['path'], 0, $pos + 1) . $reference['path'];
} else {
# or excluding the entire base URI path if it does not contain
# any "/" characters).
# or excluding the entire base URI path if it does not contain
# any "/" characters).
$merged = $base['path'];
}
}
......
......@@ -343,8 +343,23 @@ EOT;
$this->assertContains('Hello World', $output['items'][0]['properties']['content'][0]);
$this->assertNotContains('alert', $output['items'][0]['properties']['content'][0]);
}
public function testScriptElementContentsRemovedFromAllPlaintextValues() {
$input = <<<EOT
<div class="h-entry">
<span class="dt-published">contained<script>not contained</script><style>not contained</style></span>
<span class="u-url">contained<script>not contained</script><style>not contained</style></span>
</div>
EOT;
$parser = new Parser($input);
$output = $parser->parse();
$this->assertNotContains('not contained', $output['items'][0]['properties']['published'][0]);
$this->assertNotContains('not contained', $output['items'][0]['properties']['url'][0]);
}
public function testScriptTagContentsRemovedFromHTMLValue() {
public function testScriptTagContentsNotRemovedFromHTMLValue() {
$input = <<<EOT
<div class="h-entry">
<div class="e-content">
......@@ -365,10 +380,10 @@ EOT;
$this->assertContains('h-entry', $output['items'][0]['type']);
$this->assertContains('Hello World', $output['items'][0]['properties']['content'][0]['value']);
$this->assertContains('<b>Hello World</b>', $output['items'][0]['properties']['content'][0]['html']);
# The script and style tags should be removed from both HTML and plaintext results
$this->assertNotContains('alert', $output['items'][0]['properties']['content'][0]['html']);
# The script and style tags should be removed from plaintext results but left in HTML results.
$this->assertContains('alert', $output['items'][0]['properties']['content'][0]['html']);
$this->assertNotContains('alert', $output['items'][0]['properties']['content'][0]['value']);
$this->assertNotContains('visibility', $output['items'][0]['properties']['content'][0]['html']);
$this->assertContains('visibility', $output['items'][0]['properties']['content'][0]['html']);
$this->assertNotContains('visibility', $output['items'][0]['properties']['content'][0]['value']);
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment