Commit 6486f9a7 authored by Barnaby Walters

Fixed merge conflict

parents 18f6ffd6 b2769bf2
......@@ -360,6 +360,12 @@ class Parser {
}
public function textContent(DOMElement $el) {
$excludeTags = array('noframe', 'noscript', 'script', 'style', 'frames', 'frameset');
if (isset($el->tagName) and in_array(strtolower($el->tagName), $excludeTags)) {
return '';
}
$this->resolveChildUrls($el);
$clonedEl = $el->cloneNode(true);
......@@ -368,8 +374,14 @@ class Parser {
$newNode = $this->doc->createTextNode($imgEl->getAttribute($imgEl->hasAttribute('alt') ? 'alt' : 'src'));
$imgEl->parentNode->replaceChild($newNode, $imgEl);
}
foreach ($excludeTags as $tagName) {
foreach ($this->xpath->query(".//{$tagName}", $clonedEl) as $elToRemove) {
$elToRemove->parentNode->removeChild($elToRemove);
}
}
return $clonedEl->textContent;
return $this->innerText($clonedEl);
}
/**
......@@ -390,27 +402,20 @@ class Parser {
$excludeTags = array('noframe', 'noscript', 'script', 'style', 'frames', 'frameset');
if (isset($el->tagName))
{
if (isset($el->tagName)) {
if (in_array(strtolower($el->tagName), $excludeTags)) {
return $out;
}
else if ($el->tagName == 'img') {
} else if ($el->tagName == 'img') {
if ($el->getAttribute('alt') !== '') {
return $el->getAttribute('alt');
}
else if (!$implied && $el->getAttribute('src') !== '') {
{
} else if (!$implied && $el->getAttribute('src') !== '') {
return $this->resolveUrl($el->getAttribute('src'));
}
}
} else if ($el->tagName == 'area' and $el->getAttribute('alt') !== '') {
return $el->getAttribute('alt');
} else if ($el->tagName == 'abbr' and $el->getAttribute('title') !== '') {
return $el->getAttribute('title');
}
}
// if node is a text node get its text
......@@ -420,32 +425,22 @@ class Parser {
// get the text of the child nodes
if ($el->childNodes && $el->childNodes->length > 0) {
for ($j = 0; $j < $el->childNodes->length; $j++) {
$text = $this->innerText($el->childNodes->item($j), $implied);
if ( !is_null($text) )
{
if (!is_null($text)) {
$out .= $text;
}
}
}
if (isset($el->tagName)) {
// if its a block level tag add an additional space at the end
if (in_array(strtolower($el->tagName), $blockLevelTags))
{
if (in_array(strtolower($el->tagName), $blockLevelTags)) {
$out .= ' ';
}
// else if its a br, replace with newline
else if (strtolower($el->tagName) == 'br')
{
} else if (strtolower($el->tagName) == 'br') {
// else if its a br, replace with newline
$out .= "\n";
}
}
return ($out === '') ? NULL : $out;
......@@ -455,11 +450,12 @@ class Parser {
public function resolveUrl($url) {
// If the URL is seriously malformed it’s probably beyond the scope of this
// parser to try to do anything with it.
if (parse_url($url) === false)
if (parse_url($url) === false) {
return $url;
}
// per issue #40 valid URLs could have a space on either side
$url = trim($url);
// per issue #40 valid URLs could have a space on either side
$url = trim($url);
$scheme = parse_url($url, PHP_URL_SCHEME);
......@@ -677,7 +673,7 @@ class Parser {
if (!empty($value))
$dtValue = $value;
else
$dtValue = $dt->nodeValue;
$dtValue = $this->textContent($dt);
} elseif ($dt->tagName == 'abbr') {
// Use @title, otherwise innertext
// Is it an entire dt?
......@@ -685,7 +681,7 @@ class Parser {
if (!empty($title))
$dtValue = $title;
else
$dtValue = $dt->nodeValue;
$dtValue = $this->textContent($dt);
} elseif ($dt->tagName == 'del' or $dt->tagName == 'ins' or $dt->tagName == 'time') {
// Use @datetime if available, otherwise innertext
// Is it an entire dt?
......@@ -693,9 +689,9 @@ class Parser {
if (!empty($dtAttr))
$dtValue = $dtAttr;
else
$dtValue = $dt->nodeValue;
$dtValue = $this->textContent($dt);
} else {
$dtValue = $dt->nodeValue;
$dtValue = $this->textContent($dt);
}
if (preg_match('/(\d{4}-\d{2}-\d{2})/', $dtValue, $matches)) {
......@@ -740,10 +736,16 @@ class Parser {
return array(
'html' => $html,
'value' => unicodeTrim($this->textContent($e))
'value' => unicodeTrim($this->innerText($e))
);
}
/**
 * Detach every descendant element with the given tag name from $e.
 *
 * The lookup is repeated after each removal and only the first match is
 * removed per pass; the loop ends once a lookup yields no elements.
 *
 * @param \DOMElement $e       Element whose subtree is pruned in place.
 * @param string      $tagName Tag name handed to getElementsByTagName().
 */
private function removeTags(\DOMElement &$e, $tagName) {
    for (;;) {
        $matches = $e->getElementsByTagName($tagName);
        if (!$matches || !$matches->length) {
            break;
        }

        // Remove the first match via its own parent, then look again.
        $first = $matches->item(0);
        $first->parentNode->removeChild($first);
    }
}
/**
* Recursively parse microformats
*
......@@ -1422,7 +1424,7 @@ function resolveUrl($baseURI, $referenceURI) {
# 5.2.3 Merge Paths
function mergePaths($base, $reference) {
# If the base URI has a defined authority component and an empty
# path,
# path,
if($base['authority'] && $base['path'] == null) {
# then return a string consisting of "/" concatenated with the
# reference's path; otherwise,
......@@ -1430,13 +1432,13 @@ function mergePaths($base, $reference) {
} else {
if(($pos=strrpos($base['path'], '/')) !== false) {
# return a string consisting of the reference's path component
# appended to all but the last segment of the base URI's path (i.e.,
# excluding any characters after the right-most "/" in the base URI
# path,
# appended to all but the last segment of the base URI's path (i.e.,
# excluding any characters after the right-most "/" in the base URI
# path,
$merged = substr($base['path'], 0, $pos + 1) . $reference['path'];
} else {
# or excluding the entire base URI path if it does not contain
# any "/" characters).
# or excluding the entire base URI path if it does not contain
# any "/" characters).
$merged = $base['path'];
}
}
......
......@@ -326,6 +326,7 @@ EOT;
$this->assertEquals('https://example.com/banner-HD.jpeg 2x, https://example.com/banner-phone.jpeg 100w, https://example.com/banner-phone-HD.jpeg 100w 2x', $result);
}
/**
* @see https://github.com/indieweb/php-mf2/issues/84
*/
......@@ -334,4 +335,65 @@ EOT;
$this->assertEquals('https://aaronparecki.com/2014/12/23/5/photo.jpeg', $mf['items'][0]['properties']['photo'][0]);
}
/**
 * The parsed value of a p-content property keeps the visible text but must
 * not contain the contents of a nested <script> element.
 */
public function testScriptTagContentsRemovedFromTextValue() {
    $markup = <<<EOT
<div class="h-entry">
<div class="p-content">
<b>Hello World</b>
<script>alert("hi");</script>
</div>
</div>
EOT;
    $mf2 = new Parser($markup);
    $parsed = $mf2->parse();
    $entry = $parsed['items'][0];

    $this->assertContains('h-entry', $entry['type']);

    $contentText = $entry['properties']['content'][0];
    $this->assertContains('Hello World', $contentText);
    $this->assertNotContains('alert', $contentText);
}
/**
 * Script and style contents nested inside dt-* and u-* properties must not
 * show up in the parsed values of those properties.
 */
public function testScriptElementContentsRemovedFromAllPlaintextValues() {
    $markup = <<<EOT
<div class="h-entry">
<span class="dt-published">contained<script>not contained</script><style>not contained</style></span>
<span class="u-url">contained<script>not contained</script><style>not contained</style></span>
</div>
EOT;
    $mf2 = new Parser($markup);
    $parsed = $mf2->parse();
    $props = $parsed['items'][0]['properties'];

    // Same check for each property, in the original order.
    foreach (array('published', 'url') as $propName) {
        $this->assertNotContains('not contained', $props[$propName][0]);
    }
}
/**
 * For an e-content property the 'html' result keeps <script>/<style>
 * elements, while the 'value' (plaintext) result must not contain their
 * contents.
 */
public function testScriptTagContentsNotRemovedFromHTMLValue() {
    $markup = <<<EOT
<div class="h-entry">
<div class="e-content">
<b>Hello World</b>
<script>alert("hi");</script>
<style>body{ visibility: hidden; }</style>
<p>
<script>alert("hi");</script>
<style>body{ visibility: hidden; }</style>
</p>
</div>
</div>
EOT;
    $mf2 = new Parser($markup);
    $parsed = $mf2->parse();
    $entry = $parsed['items'][0];

    $this->assertContains('h-entry', $entry['type']);

    $content = $entry['properties']['content'][0];
    $this->assertContains('Hello World', $content['value']);
    $this->assertContains('<b>Hello World</b>', $content['html']);

    // Script and style elements stay in the HTML result but are stripped
    // from the plaintext one.
    $this->assertContains('alert', $content['html']);
    $this->assertNotContains('alert', $content['value']);
    $this->assertContains('visibility', $content['html']);
    $this->assertNotContains('visibility', $content['value']);
}
}
......@@ -212,9 +212,6 @@ class UrlTest extends PHPUnit_Framework_TestCase {
array('relative add host from base',
'http://www.example.com', 'server.php', 'http://www.example.com/server.php'),
array('relative add scheme host user from base',
'http://user:@www.example.com', 'server.php', 'http://user:@www.example.com/server.php'),
array('relative add scheme host pass from base',
'http://:pass@www.example.com', 'server.php', 'http://:pass@www.example.com/server.php'),
......@@ -256,6 +253,15 @@ class UrlTest extends PHPUnit_Framework_TestCase {
);
// PHP 5.4 and before returns a different result, but either are acceptable
if(PHP_MAJOR_VERSION <= 5 && PHP_MINOR_VERSION <= 4) {
$cases[] = array('relative add scheme host user from base',
'http://user:@www.example.com', 'server.php', 'http://user@www.example.com/server.php');
} else {
$cases[] = array('relative add scheme host user from base',
'http://user:@www.example.com', 'server.php', 'http://user:@www.example.com/server.php');
}
// Test cases from RFC
// http://tools.ietf.org/html/rfc3986#section-5.4
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment