Parser.php 59.3 KB
Newer Older
1 2
<?php

3
namespace Mf2;
4

5 6 7 8 9 10
use DOMDocument;
use DOMElement;
use DOMXPath;
use DOMNode;
use DOMNodeList;
use Exception;
11
use SplObjectStorage;
Barnaby Walters's avatar
Barnaby Walters committed
12
use stdClass;
Barnaby Walters's avatar
Barnaby Walters committed
13

14 15
/**
 * Parse Microformats2
 *
 * Functional shortcut for the commonest cases of parsing microformats2 from HTML.
 * Builds a Parser for the input and returns the result of its parse() method.
 *
 * Example usage:
 *
 *     use Mf2;
 *     $output = Mf2\parse('<span class="h-card">Barnaby Walters</span>');
 *     echo json_encode($output, JSON_PRETTY_PRINT);
 *
 * Produces:
 *
 *     {
 *      "items": [
 *       {
 *        "type": ["h-card"],
 *        "properties": {
 *         "name": ["Barnaby Walters"]
 *        }
 *       }
 *      ],
 *      "rels": {}
 *     }
 *
 * @param string|DOMDocument $input The HTML string or DOMDocument object to parse
 * @param string|null $url The URL the input document was found at, for relative URL resolution
 * @param bool $convertClassic whether or not to convert classic microformats
 * @return array Canonical MF2 array structure
 */
function parse($input, $url = null, $convertClassic = true) {
	$parser = new Parser($input, $url);

	return $parser->parse($convertClassic);
}

Barnaby Walters's avatar
Barnaby Walters committed
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
/**
 * Fetch microformats2
 *
 * Given a URL, fetches it (following up to 5 redirects) and, if the content-type appears to be HTML, returns the parsed
 * microformats2 array structure.
 *
 * Note that even if the response code was a 4XX or 5XX error, if the content-type is HTML-like then it will be parsed
 * all the same, as there are legitimate cases where error pages might contain useful microformats (for example a deleted
 * h-entry resulting in a 410 Gone page with a stub h-entry explaining the reason for deletion). Look in $curlInfo['http_code']
 * for the actual value.
 *
 * @param string $url The URL to fetch
 * @param bool $convertClassic (optional, default true) whether or not to convert classic microformats
 * @param &array $curlInfo (optional) the results of curl_getinfo will be placed in this variable for debugging
 * @return array|null canonical microformats2 array structure on success, null on failure
 */
function fetch($url, $convertClassic = true, &$curlInfo=null) {
	$ch = curl_init();
	curl_setopt($ch, CURLOPT_URL, $url);
	curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
	curl_setopt($ch, CURLOPT_HEADER, 0);
	curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
	curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
	$html = curl_exec($ch);
	$info = $curlInfo = curl_getinfo($ch);
	curl_close($ch);

	// With CURLOPT_RETURNTRANSFER set, curl_exec() returns false when the transfer failed.
	if ($html === false) {
		return null;
	}

	// The content type may be missing from the transfer info; treat that as "not HTML".
	$contentType = isset($info['content_type']) ? (string) $info['content_type'] : '';

	if (strpos(strtolower($contentType), 'html') === false) {
		// The content was not delivered as HTML, do not attempt to parse it.
		return null;
	}

	# ensure the final URL (after any redirects) is used to resolve relative URLs
	$url = $info['url'];

	return parse($html, $url, $convertClassic);
}

87 88 89
/**
 * Unicode to HTML Entities
 *
 * Converts the input with mb_convert_encoding() to the 'HTML-ENTITIES' target,
 * using the source encoding reported by mb_detect_encoding().
 *
 * @param string $input String containing characters to convert into HTML entities
 * @return string
 */
function unicodeToHtmlEntities($input) {
	$encoding = mb_detect_encoding($input);

	// mb_detect_encoding() returns false when it cannot decide; fall back to
	// UTF-8 instead of handing false to mb_convert_encoding().
	if ($encoding === false) {
		$encoding = 'UTF-8';
	}

	return mb_convert_encoding($input, 'HTML-ENTITIES', $encoding);
}

96 97
/**
 * Collapse Whitespace
 *
 * Collapses any sequences of whitespace within a string into a single space
 * character.
 *
 * @deprecated since v0.2.3
 * @param string $str
 * @return string
 */
function collapseWhitespace($str) {
	// \s already covers newlines. The previous class "[\s|\n]" also matched a
	// literal "|" and so replaced pipe characters with spaces.
	return preg_replace('/\s+/', ' ', $str);
}

110
/**
 * Unicode Trim
 *
 * Replaces every UTF-8 encoded no-break space (U+00A0) in the string with a
 * regular space, then strips leading and trailing whitespace.
 *
 * Note that no-break spaces in the middle of the string are converted as well,
 * not only those at the edges.
 *
 * @param string $str
 * @return string
 */
function unicodeTrim($str) {
	// "\xC2\xA0" is the UTF-8 byte sequence for &nbsp;, written literally so
	// the entity does not have to be decoded on every call.
	// this is cheating. TODO: find a better way if this causes any problems
	$str = str_replace("\xC2\xA0", ' ', $str);
	$str = preg_replace('/^\s+/', '', $str);

	return preg_replace('/\s+$/', '', $str);
}
116 117 118

/**
 * Microformat Name From Class string
 *
 * Given the value of @class, get the relevant mf classnames (e.g. h-card,
 * p-name).
 *
 * Only classnames consisting solely of lowercase a-z and hyphens are
 * considered. For the 'h-' prefix the full classname is returned (e.g.
 * 'h-card'); for any other prefix the prefix is stripped (e.g. 'p-name'
 * yields 'name'). A classname equal to the bare prefix is ignored.
 *
 * @param string $class A space delimited list of classnames
 * @param string $prefix The prefix to look for
 * @return array List of matching names, in document order; empty if none matched
 */
function mfNamesFromClass($class, $prefix='h-') {
	// Treat tabs and newlines as classname separators too.
	$class = str_replace(array("\t", "\n"), ' ', $class);
	$classes = preg_grep('#^[a-z\-]+$#', explode(' ', $class));
	$prefixLength = strlen($prefix);
	$matches = array();

	foreach ($classes as $classname) {
		// Keep names that start with the prefix and have something after it.
		if (strncmp($classname, $prefix, $prefixLength) === 0 && $classname != $prefix) {
			$matches[] = ($prefix === 'h-') ? $classname : substr($classname, $prefixLength);
		}
	}

	return $matches;
}

/**
 * Get Nested µf Property Name From Class
 *
 * Finds all the p-, u-, dt- or e- prefixed classnames in a space-separated
 * string (tabs and newlines also separate classnames).
 *
 * @param string $class
 * @return array Map of property name (prefix stripped) => array of the
 *               distinct prefixes it appeared with, e.g.
 *               array('author' => array('p-', 'u-'))
 */
function nestedMfPropertyNamesFromClass($class) {
	$prefixes = array('p-', 'u-', 'dt-', 'e-');
	$propertyNames = array();

	$class = str_replace(array("\t", "\n"), ' ', $class);

	foreach (explode(' ', $class) as $classname) {
		foreach ($prefixes as $prefix) {
			$prefixLength = mb_strlen($prefix);

			// A valid property classname starts with the prefix and has a name after it.
			if (mb_substr($classname, 0, $prefixLength) == $prefix && $classname != $prefix) {
				$propertyNames[mb_substr($classname, $prefixLength)][] = $prefix;
			}
		}
	}

	// The same prefixed classname may be listed more than once; keep each prefix once.
	foreach ($propertyNames as $property => $foundPrefixes) {
		$propertyNames[$property] = array_unique($foundPrefixes);
	}

	return $propertyNames;
}

/**
 * Element-based convenience wrapper around mfNamesFromClass()
 *
 * Reads the element's class attribute and passes it on together with the prefix.
 *
 * @param DOMElement $e The element whose class attribute is inspected
 * @param string $prefix The prefix to look for
 * @return mixed See return value of mfNamesFromClass()
 */
function mfNamesFromElement(\DOMElement $e, $prefix = 'h-') {
	return mfNamesFromClass($e->getAttribute('class'), $prefix);
}

/**
 * Element-based convenience wrapper around nestedMfPropertyNamesFromClass()
 *
 * @param DOMElement $e The element whose class attribute is inspected
 * @return array See return value of nestedMfPropertyNamesFromClass()
 */
function nestedMfPropertyNamesFromElement(\DOMElement $e) {
	return nestedMfPropertyNamesFromClass($e->getAttribute('class'));
}

195 196 197 198 199 200 201
/**
 * Converts 12-hour am/pm times to 24-hour HH:MM or HH:MM:SS
 *
 * Recognises forms such as "7pm", "7:30pm", "7:30:15p.m." and "0730am"
 * (case-insensitive, dots in the meridiem optional, no space before it).
 * Input without an am/pm marker is returned unchanged.
 *
 * @param string $time The time to convert
 * @return string The converted time; seconds are included only if they were supplied
 */
function convertTimeFormat($time) {
	preg_match('/(\d{1,2}):?(\d{2})?:?(\d{2})?(a\.?m\.?|p\.?m\.?)?/i', $time, $matches);

	// If no am/pm is specified there is nothing to convert.
	if (empty($matches[4])) {
		return $time;
	}

	$meridiem = strtolower(str_replace('.', '', $matches[4]));
	$hh = (int) $matches[1];

	if ($meridiem == 'pm' && $hh < 12) {
		// 1pm-11pm become 13-23; 12pm stays 12.
		$hh += 12;
	} elseif ($meridiem == 'am' && $hh == 12) {
		// 12am is midnight, i.e. hour 00.
		$hh = 0;
	}

	$hh = str_pad((string) $hh, 2, '0', STR_PAD_LEFT);

	// Minutes default to 00 when omitted.
	$mm = empty($matches[2]) ? '00' : $matches[2];

	// Seconds, only if supplied.
	if (empty($matches[3])) {
		return sprintf('%s:%s', $hh, $mm);
	}

	return sprintf('%s:%s:%s', $hh, $mm, $matches[3]);
}

238 239 240 241 242 243 244 245 246 247 248 249 250
/**
 * Apply a URL transformation to every candidate of a srcset attribute value
 *
 * The value is split on commas; each candidate is split at its first run of
 * whitespace into a URL and an optional descriptor (e.g. "2x", "480w").
 * Only the URL is passed to $transformation; the descriptor is re-attached
 * unchanged. Empty candidates are dropped.
 *
 * @param string $srcset The raw srcset attribute value
 * @param callable $transformation Called with each URL, must return the replacement URL
 * @return string The rebuilt srcset value, candidates joined with ', '
 */
function applySrcsetUrlTransformation($srcset, $transformation) {
	$candidates = array();

	foreach (explode(',', trim($srcset)) as $srcsetPart) {
		// explode() would treat a character list as one literal delimiter,
		// so split URL from descriptor with a whitespace regex instead.
		$parts = preg_split('/\s+/', trim($srcsetPart), 2);

		if ($parts[0] === '') {
			continue;
		}

		$candidate = call_user_func($transformation, $parts[0]);

		if (isset($parts[1]) && $parts[1] !== '') {
			$candidate .= ' ' . $parts[1];
		}

		if ($candidate !== '') {
			$candidates[] = $candidate;
		}
	}

	return implode(', ', $candidates);
}

251 252
/**
 * Microformats2 Parser
253
 *
254
 * A class which holds state for parsing microformats2 from HTML.
255
 *
256
 * Example usage:
257
 *
258 259 260 261
 *     use Mf2;
 *     $parser = new Mf2\Parser('<p class="h-card">Barnaby Walters</p>');
 *     $output = $parser->parse();
 */
Barnaby Walters's avatar
Barnaby Walters committed
262
class Parser {
263 264 265 266
	/** @var string The baseurl (if any) to use for this parse */
	public $baseurl;

	/** @var DOMXPath object which can be used to query over any fragment*/
267
	public $xpath;
268

269 270
	/** @var DOMDocument */
	public $doc;
271

272
	/** @var SplObjectStorage */
273
	protected $parsed;
274

Barnaby Walters's avatar
Barnaby Walters committed
275
	public $jsonMode;
276

277 278 279 280 281 282 283
	/**
	 * Elements upgraded to mf2 during backcompat
	 * @var SplObjectStorage
	 */
	protected $upgraded;


284 285
	/**
	 * Constructor
	 *
	 * Loads the input into a DOMDocument, determines the base URL (honouring the
	 * first <base href> element, if any) and removes all <template> elements.
	 *
	 * Side effect: switches libxml to internal error handling for the process.
	 *
	 * @param DOMDocument|string $input The data to parse. A string of HTML or a DOMDocument. Any other type is treated as an empty document
	 * @param string $url The URL of the parsed document, for relative URL resolution
	 * @param boolean $jsonMode Whether or not to use a stdClass instance for an empty `rels` dictionary. This breaks PHP looping over rels, but allows the output to be correctly serialized as JSON.
	 */
	public function __construct($input, $url = null, $jsonMode = false) {
		libxml_use_internal_errors(true);

		// DOMDocument::loadHTML() rejects an empty string (a ValueError on PHP 8,
		// which @ cannot silence), so empty input is loaded as a minimal document.
		$emptyDocument = '<html><body></body></html>';

		if (is_string($input)) {
			$doc = new DOMDocument();
			@$doc->loadHTML($input === '' ? $emptyDocument : unicodeToHtmlEntities($input));
		} elseif ($input instanceof DOMDocument) {
			$doc = $input;
		} else {
			$doc = new DOMDocument();
			@$doc->loadHTML($emptyDocument);
		}

		$this->xpath = new DOMXPath($doc);

		// Only the first <base href> in the document is considered.
		$baseurl = $url;
		$baseElements = $this->xpath->query('//base[@href]');

		if ($baseElements->length > 0) {
			$baseElementUrl = $baseElements->item(0)->getAttribute('href');

			if (parse_url($baseElementUrl, PHP_URL_SCHEME) === null) {
				// The base element URL has no scheme, so it is itself relative
				// to the document URL and must be resolved against it.
				$baseurl = resolveUrl($url, $baseElementUrl);
			} else {
				$baseurl = $baseElementUrl;
			}
		}

		// Ignore <template> elements as per the HTML5 spec
		foreach ($this->xpath->query('//template') as $templateEl) {
			$templateEl->parentNode->removeChild($templateEl);
		}

		$this->baseurl = $baseurl;
		$this->doc = $doc;
		$this->parsed = new SplObjectStorage();
		$this->upgraded = new SplObjectStorage();
		$this->jsonMode = $jsonMode;
	}
334

335 336 337
	/**
	 * Record that an element has been parsed for the given prefix
	 *
	 * The prefix is appended to the list stored for the element in $this->parsed;
	 * the entry is created on first use.
	 *
	 * @param DOMElement $e
	 * @param string $prefix e.g. 'h', 'p', 'u', 'dt' or 'e'
	 */
	private function elementPrefixParsed(\DOMElement $e, $prefix) {
		$prefixes = $this->parsed->contains($e) ? $this->parsed[$e] : array();
		$prefixes[] = $prefix;

		// Assigning through array access attaches the element if it is not stored yet.
		$this->parsed[$e] = $prefixes;
	}
343

344 345 346 347 348 349
	/**
	 * Determine if the element has already been parsed for the given prefix
	 *
	 * @param DOMElement $e
	 * @param string $prefix
	 * @return bool true only if $e is stored in $this->parsed and $prefix is among its recorded prefixes
	 */
	private function isElementParsed(\DOMElement $e, $prefix) {
		if (!$this->parsed->contains($e)) {
			return false;
		}

		return in_array($prefix, $this->parsed[$e]);
	}
363

364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379
	/**
	 * Check whether a property of an element was recorded as upgraded during backcompat
	 *
	 * @param DOMElement $el
	 * @param string $property
	 * @return bool true only when $el is stored in $this->upgraded and $property is listed for it
	 */
	private function isElementUpgraded(\DOMElement $el, $property) {
		if (!$this->upgraded->contains($el)) {
			return false;
		}

		return in_array($property, $this->upgraded[$el]);
	}

380 381 382 383 384 385 386 387
	/**
	 * Resolve the URL-carrying attributes of all descendants of an element
	 *
	 * Rewrites href, src, data and srcset on every descendant of $el (not on
	 * $el itself) in place, using $this->resolveUrl(). This modifies the
	 * document the element belongs to.
	 *
	 * @param DOMElement $el
	 */
	private function resolveChildUrls(DOMElement $el) {
		// @srcset is part of the selection so that elements carrying only a
		// srcset (e.g. <source srcset>) are processed as well.
		$hyperlinkChildren = $this->xpath->query('.//*[@src or @href or @srcset or @data]', $el);

		foreach ($hyperlinkChildren as $child) {
			foreach (array('href', 'src', 'data') as $attribute) {
				if ($child->hasAttribute($attribute)) {
					$child->setAttribute($attribute, $this->resolveUrl($child->getAttribute($attribute)));
				}
			}

			if ($child->hasAttribute('srcset')) {
				// Resolve each candidate URL of the srcset value itself (previously
				// the href attribute was read here, overwriting srcset with the wrong data).
				$child->setAttribute('srcset', applySrcsetUrlTransformation($child->getAttribute('srcset'), array($this, 'resolveUrl')));
			}
		}
	}

	/**
	 * Get the plain text content of an element
	 *
	 * Works on a deep clone of $el in which every <img> is replaced by a text
	 * node holding its alt (or, when it has no alt attribute, its src) and all
	 * excluded elements are removed; the clone is then passed to innerText().
	 *
	 * Side effect: URL attributes of $el's descendants are resolved in the
	 * original document before cloning (see resolveChildUrls()).
	 *
	 * @param DOMElement $el
	 * @return string|null '' if $el itself is an excluded element, otherwise the result of innerText()
	 */
	public function textContent(DOMElement $el) {
		// NOTE(review): 'noframe' and 'frames' are not HTML element names; possibly 'noframes' was intended - confirm.
		$excludeTags = array('noframe', 'noscript', 'script', 'style', 'frames', 'frameset');

		if (isset($el->tagName) and in_array(strtolower($el->tagName), $excludeTags)) {
			return '';
		}

		$this->resolveChildUrls($el);

		$clonedEl = $el->cloneNode(true);

		foreach ($this->xpath->query('.//img', $clonedEl) as $imgEl) {
			$replacementAttribute = $imgEl->hasAttribute('alt') ? 'alt' : 'src';
			$newNode = $this->doc->createTextNode($imgEl->getAttribute($replacementAttribute));
			$imgEl->parentNode->replaceChild($newNode, $imgEl);
		}

		foreach ($excludeTags as $tagName) {
			foreach ($this->xpath->query(".//{$tagName}", $clonedEl) as $elToRemove) {
				$elToRemove->parentNode->removeChild($elToRemove);
			}
		}

		return $this->innerText($clonedEl);
	}

420 421 422 423 424 425 426
	/**
	 * This method attempts to return a better 'innerText' representation than DOMNode::textContent
	 *
	 * Rules applied, in order:
	 *  - excluded elements (script, style, ...) yield an empty string;
	 *  - img yields its alt, or (unless $implied) its resolved src;
	 *  - area with alt yields alt, abbr with title yields title;
	 *  - otherwise the text of the node and, recursively, of its children is
	 *    concatenated, followed by a space for block level elements (and for
	 *    <data> when $implied) or a newline for <br>.
	 *
	 * @param DOMElement|DOMText $el
	 * @param bool $implied when parsing for implied name for h-*, rules may be slightly different
	 * @return string|null The collected text; NULL when nothing was collected
	 *                     ('' only for excluded elements)
	 * @see: https://github.com/glennjones/microformat-shiv/blob/dev/lib/text.js
	 */
	public function innerText($el, $implied=false) {
		$out = '';

		// NOTE(review): 'testarea' looks like a typo for 'textarea' (and 'hr' is listed twice); left unchanged to keep output stable - confirm.
		$blockLevelTags = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'pre', 'table',
			'address', 'article', 'aside', 'blockquote', 'caption', 'col', 'colgroup', 'dd', 'div',
			'dt', 'dir', 'fieldset', 'figcaption', 'figure', 'footer', 'form',  'header', 'hgroup', 'hr',
			'li', 'map', 'menu', 'nav', 'optgroup', 'option', 'section', 'tbody', 'testarea',
			'tfoot', 'th', 'thead', 'tr', 'td', 'ul', 'ol', 'dl', 'details');

		$excludeTags = array('noframe', 'noscript', 'script', 'style', 'frames', 'frameset');

		// PHP DOMDocument doesn't correctly handle whitespace around elements it doesn't recognise.
		$unsupportedTags = array('data');

		if (isset($el->tagName)) {
			if (in_array(strtolower($el->tagName), $excludeTags)) {
				return $out;
			} else if ($el->tagName == 'img') {
				if ($el->hasAttribute('alt')) {
					return $el->getAttribute('alt');
				} else if (!$implied && $el->hasAttribute('src')) {
					return $this->resolveUrl($el->getAttribute('src'));
				}
			} else if ($el->tagName == 'area' and $el->hasAttribute('alt')) {
				return $el->getAttribute('alt');
			} else if ($el->tagName == 'abbr' and $el->hasAttribute('title')) {
				return $el->getAttribute('title');
			}
		}

		// if node is a text node (nodeType 3) get its text
		if (isset($el->nodeType) && $el->nodeType === 3) {
			$out .= $el->textContent;
		}

		// get the text of the child nodes
		if ($el->childNodes && $el->childNodes->length > 0) {
			foreach ($el->childNodes as $childNode) {
				$text = $this->innerText($childNode, $implied);

				if (!is_null($text)) {
					$out .= $text;
				}
			}
		}

		if (isset($el->tagName)) {
			$lowerTagName = strtolower($el->tagName);

			if (in_array($lowerTagName, $blockLevelTags)) {
				// if its a block level tag add an additional space at the end
				$out .= ' ';
			} elseif ($implied and in_array($lowerTagName, $unsupportedTags)) {
				$out .= ' ';
			} else if ($lowerTagName == 'br') {
				// else if its a br, replace with newline
				$out .= "\n";
			}
		}

		return ($out === '') ? NULL : $out;
	}

487 488 489 490 491 492 493 494 495 496
	/**
	 * This method parses the language of an element
	 *
	 * Uses the element's own lang attribute if present; otherwise walks up
	 * through its ancestors. At the <html> element without a lang attribute,
	 * falls back to a <meta http-equiv="Content-Language"> value.
	 *
	 * @param DOMElement $el
	 * @access public
	 * @return string The trimmed language value, or '' if none was found
	 */
	public function language(DOMElement $el)
	{
		// element has a lang attribute; use it
		if ($el->hasAttribute('lang')) {
			return unicodeTrim($el->getAttribute('lang'));
		}

		if ($el->tagName == 'html') {
			// we're at the <html> element and no lang; check <meta> http-equiv Content-Language
			foreach ($this->xpath->query('.//meta[@http-equiv]') as $node) {
				if ($node->hasAttribute('http-equiv') && $node->hasAttribute('content') && strtolower($node->getAttribute('http-equiv')) == 'content-language') {
					return unicodeTrim($node->getAttribute('content'));
				}
			}

			return '';
		}

		// check the parent node; stop when there is no parent element (e.g. a
		// detached node or a non-html root), since the parameter must be a DOMElement
		$parent = $el->parentNode;

		return ($parent instanceof DOMElement) ? $this->language($parent) : '';
	} # end method language()

516
	/**
	 * Resolve a possibly relative URL against this parser's base URL
	 *
	 * A URL that parse_url() cannot parse is returned untouched. Otherwise the
	 * URL is trimmed; if it has no scheme and a base URL is known it is resolved
	 * with the namespace-level resolveUrl() function, else the trimmed URL is
	 * returned as is.
	 *
	 * TODO: figure out if this has problems with sms: and geo: URLs
	 *
	 * @param string $url
	 * @return string
	 */
	public function resolveUrl($url) {
		// If the URL is seriously malformed it's probably beyond the scope of this
		// parser to try to do anything with it.
		if (parse_url($url) === false) {
			return $url;
		}

		// per issue #40 valid URLs could have a space on either side
		$url = trim($url);

		$scheme = parse_url($url, PHP_URL_SCHEME);

		if (empty($scheme) and !empty($this->baseurl)) {
			return resolveUrl($this->baseurl, $url);
		}

		return $url;
	}
535

536
	// Parsing Functions
537

538
	/**
	 * Parse value-class/value-title on an element
	 *
	 * Looks at the direct children of $e. If any have the class "value", their
	 * text contents are concatenated; otherwise, if any have the class
	 * "value-title", their title attributes are concatenated. The result is
	 * trimmed.
	 *
	 * @param \DOMElement $e
	 * @param string $separator Accepted for API compatibility; the current implementation does not use it and concatenates values directly
	 * @return string|null the parsed value or null if value-class or -title aren't in use
	 */
	public function parseValueClassTitle(\DOMElement $e, $separator = '') {
		$valueClassElements = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ")]', $e);

		if ($valueClassElements->length !== 0) {
			// Process value-class stuff
			$val = '';

			foreach ($valueClassElements as $el) {
				$val .= $this->textContent($el);
			}

			return unicodeTrim($val);
		}

		$valueTitleElements = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value-title ")]', $e);

		if ($valueTitleElements->length !== 0) {
			// Process value-title stuff
			$val = '';

			foreach ($valueTitleElements as $el) {
				$val .= $el->getAttribute('title');
			}

			return unicodeTrim($val);
		}

		// No value-title or -class in this element
		return null;
	}
574

575
	/**
	 * Given an element with class="p-*", get its value
	 *
	 * A value-class/value-title result takes precedence. Otherwise a non-empty
	 * alt (img, area), title (abbr) or value (data, input) attribute is used,
	 * and failing that the trimmed innerText() of the element.
	 *
	 * Side effect: URL attributes of $p's descendants are resolved in place.
	 *
	 * @param DOMElement $p The element to parse
	 * @return string The plaintext value of $p, dependant on type
	 */
	public function parseP(\DOMElement $p) {
		$classTitle = $this->parseValueClassTitle($p, ' ');

		if ($classTitle !== null) {
			return $classTitle;
		}

		$this->resolveChildUrls($p);

		// Pick the attribute that carries the value for this kind of element, if any.
		$valueAttribute = null;

		if ($p->tagName == 'img' or $p->tagName == 'area') {
			$valueAttribute = 'alt';
		} elseif ($p->tagName == 'abbr') {
			$valueAttribute = 'title';
		} elseif (in_array($p->tagName, array('data', 'input'))) {
			$valueAttribute = 'value';
		}

		if ($valueAttribute !== null and $p->getAttribute($valueAttribute) !== '') {
			return $p->getAttribute($valueAttribute);
		}

		return unicodeTrim($this->innerText($p));
	}

	/**
	 * Given an element with class="u-*", get the value of the URL
	 *
	 * Precedence: href (a, area), src (img, audio, video, source) or data
	 * (object) when that attribute is present - these are resolved against the
	 * base URL; then value-class/value-title; then title (abbr) or value
	 * (data, input) when present; finally the trimmed text content.
	 *
	 * @param DOMElement $u The element to parse
	 * @return string The plaintext value of $u, dependant on type
	 */
	public function parseU(\DOMElement $u) {
		// DOMElement::getAttribute() returns '' (never null) for a missing
		// attribute, so presence must be tested with hasAttribute().
		if (($u->tagName == 'a' or $u->tagName == 'area') and $u->hasAttribute('href')) {
			$uValue = $u->getAttribute('href');
		} elseif (in_array($u->tagName, array('img', 'audio', 'video', 'source')) and $u->hasAttribute('src')) {
			$uValue = $u->getAttribute('src');
		} elseif ($u->tagName == 'object' and $u->hasAttribute('data')) {
			$uValue = $u->getAttribute('data');
		}

		if (isset($uValue)) {
			return $this->resolveUrl($uValue);
		}

		$classTitle = $this->parseValueClassTitle($u);

		if ($classTitle !== null) {
			return $classTitle;
		}

		if ($u->tagName == 'abbr' and $u->hasAttribute('title')) {
			return $u->getAttribute('title');
		}

		if (in_array($u->tagName, array('data', 'input')) and $u->hasAttribute('value')) {
			return $u->getAttribute('value');
		}

		return unicodeTrim($this->textContent($u));
	}

	/**
	 * Given an element with class="dt-*", get the datetime value as a string
	 *
	 * If $dt has direct children with class "value" or "value-title", their
	 * values are collected and combined: a part that is a complete ISO8601
	 * datetime wins outright; otherwise the first date-like and the first
	 * time-like part are joined as "<date>T<time>", with am/pm times converted
	 * by convertTimeFormat(). Without value-class children the value comes from
	 * the element's alt, value, title or datetime attribute (depending on the
	 * tag) or from its text content.
	 *
	 * A value that is only a time is prefixed with the most recent entry of
	 * $dates, if there is one.
	 *
	 * @param DOMElement $dt The element to parse
	 * @param array $dates Array of dates (YYYY-MM-DD) processed so far; dates found here are appended (passed by reference)
	 * @return string|false|null The datetime string found; false if no value could be determined
	 *                           (NULL is possible when the text content is empty)
	 */
	public function parseDT(\DOMElement $dt, &$dates = array()) {
		// Check for value-class pattern
		$valueClassChildren = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ") or contains(concat(" ", @class, " "), " value-title ")]', $dt);
		$dtValue = false;

		if ($valueClassChildren->length > 0) {
			// They're using value-class
			$dateParts = array();

			foreach ($valueClassChildren as $e) {
				if (strstr(' ' . $e->getAttribute('class') . ' ', ' value-title ')) {
					$title = $e->getAttribute('title');
					if (!empty($title)) {
						$dateParts[] = $title;
					}
				} elseif ($e->tagName == 'img' or $e->tagName == 'area') {
					// Use @alt
					$alt = $e->getAttribute('alt');
					if (!empty($alt)) {
						$dateParts[] = $alt;
					}
				} elseif ($e->tagName == 'data') {
					// Use @value, otherwise innertext
					$value = $e->hasAttribute('value') ? $e->getAttribute('value') : unicodeTrim($e->nodeValue);
					if (!empty($value)) {
						$dateParts[] = $value;
					}
				} elseif ($e->tagName == 'abbr') {
					// Use @title, otherwise innertext
					$title = $e->hasAttribute('title') ? $e->getAttribute('title') : unicodeTrim($e->nodeValue);
					if (!empty($title)) {
						$dateParts[] = $title;
					}
				} elseif ($e->tagName == 'del' or $e->tagName == 'ins' or $e->tagName == 'time') {
					// Use @datetime if available, otherwise innertext
					$dtAttr = ($e->hasAttribute('datetime')) ? $e->getAttribute('datetime') : unicodeTrim($e->nodeValue);
					if (!empty($dtAttr)) {
						$dateParts[] = $dtAttr;
					}
				} else {
					if (!empty($e->nodeValue)) {
						$dateParts[] = unicodeTrim($e->nodeValue);
					}
				}
			}

			// Look through dateParts
			$datePart = '';
			$timePart = '';

			foreach ($dateParts as $part) {
				// Is this part a full ISO8601 datetime?
				if (preg_match('/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}(?::\d{2})?(?:Z?[+|-]\d{2}:?\d{2})?$/', $part)) {
					// Break completely, we've got our value.
					$dtValue = $part;
					break;
				}

				// Is the current part a valid time(+TZ?) AND no other time representation has been found?
				if ((preg_match('/\d{1,2}:\d{1,2}(Z?[+|-]\d{2}:?\d{2})?/', $part) or preg_match('/\d{1,2}[a|p]m/', $part)) and empty($timePart)) {
					$timePart = $part;
				} elseif (preg_match('/\d{4}-\d{2}-\d{2}/', $part) and empty($datePart)) {
					// Is the current part a valid date AND no other date representation has been found?
					$datePart = $part;
				}

				if (!empty($datePart) && !in_array($datePart, $dates)) {
					$dates[] = $datePart;
				}

				// Rebuild the value from whatever has been found so far.
				// unicodeTrim() takes a single argument; the extra 'T' formerly
				// passed to it was ignored and has been dropped.
				if (empty($datePart) && !empty($timePart)) {
					$timePart = convertTimeFormat($timePart);
					$dtValue = unicodeTrim($timePart);
				} elseif (!empty($datePart) && empty($timePart)) {
					$dtValue = rtrim($datePart, 'T');
				} else {
					// NOTE(review): this branch also runs when neither a date nor a
					// time has been found yet, producing the bare value 'T' - confirm intent.
					$timePart = convertTimeFormat($timePart);
					$dtValue = rtrim($datePart, 'T') . 'T' . unicodeTrim($timePart);
				}
			}
		} else {
			// Not using value-class (phew).
			if ($dt->tagName == 'img' or $dt->tagName == 'area') {
				// Use @alt
				// Is it an entire dt?
				$alt = $dt->getAttribute('alt');
				if (!empty($alt)) {
					$dtValue = $alt;
				}
			} elseif (in_array($dt->tagName, array('data'))) {
				// Use @value, otherwise innertext
				// Is it an entire dt?
				$value = $dt->getAttribute('value');
				if (!empty($value)) {
					$dtValue = $value;
				} else {
					$dtValue = $this->textContent($dt);
				}
			} elseif ($dt->tagName == 'abbr') {
				// Use @title, otherwise innertext
				// Is it an entire dt?
				$title = $dt->getAttribute('title');
				if (!empty($title)) {
					$dtValue = $title;
				} else {
					$dtValue = $this->textContent($dt);
				}
			} elseif ($dt->tagName == 'del' or $dt->tagName == 'ins' or $dt->tagName == 'time') {
				// Use @datetime if available, otherwise innertext
				// Is it an entire dt?
				$dtAttr = $dt->getAttribute('datetime');
				if (!empty($dtAttr)) {
					$dtValue = $dtAttr;
				} else {
					$dtValue = $this->textContent($dt);
				}
			} else {
				$dtValue = $this->textContent($dt);
			}

			// Remember the date portion so a later time-only dt- value can reuse it.
			if (preg_match('/(\d{4}-\d{2}-\d{2})/', $dtValue, $matches)) {
				$dates[] = $matches[0];
			}
		}

		/**
		 * if $dtValue is only a time and there are recently parsed dates,
		 * form the full date-time using the most recently parsed dt- value
		 */
		if ((preg_match('/^\d{1,2}:\d{1,2}(Z?[+|-]\d{2}:?\d{2})?/', $dtValue) or preg_match('/^\d{1,2}[a|p]m/', $dtValue)) && !empty($dates)) {
			$dtValue = convertTimeFormat($dtValue);
			$dtValue = end($dates) . 'T' . unicodeTrim($dtValue);
		}

		return $dtValue;
	}

	/**
	 * Given the root element of some embedded markup, return its HTML and plaintext value
	 *
	 * A value-class/value-title result, if any, is returned directly as a
	 * string. Otherwise relative URLs in the element's descendants are resolved
	 * in place and the serialized child nodes are returned together with the
	 * trimmed innerText() and, when one is found, the language.
	 *
	 * @param DOMElement $e The element to parse
	 * @return array|string array('html' => $e's innerHTML, 'value' => plaintext[, 'html-lang' => language]),
	 *                      or the value-class string when that pattern is in use
	 *
	 * @todo need to mark this element as e- parsed so it doesn't get parsed as it's parent's e-* too
	 */
	public function parseE(\DOMElement $e) {
		$classTitle = $this->parseValueClassTitle($e);

		if ($classTitle !== null) {
			return $classTitle;
		}

		// Expand relative URLs within children of this element
		// TODO: as it is this is not relative to only children, make this .// and rerun tests
		$this->resolveChildUrls($e);

		$html = '';

		foreach ($e->childNodes as $node) {
			$html .= $node->ownerDocument->saveHTML($node);
		}

		$return = array(
			'html' => $html,
			'value' => unicodeTrim($this->innerText($e)),
		);

		// Language, only included when non-empty
		$html_lang = $this->language($e);

		if ($html_lang) {
			$return['html-lang'] = $html_lang;
		}

		return $return;
	}

817 818 819 820 821 822
	/**
	 * Remove every descendant of $e with the given tag name
	 *
	 * Repeatedly looks up the matching descendants and detaches the first one
	 * from its parent until no match is left.
	 *
	 * @param DOMElement $e
	 * @param string $tagName
	 */
	private function removeTags(\DOMElement &$e, $tagName) {
		for ($found = $e->getElementsByTagName($tagName); $found && $found->length; $found = $e->getElementsByTagName($tagName)) {
			$first = $found->item(0);
			$first->parentNode->removeChild($first);
		}
	}

823 824
	/**
	 * Recursively parse microformats
825
	 *
826
	 * @param DOMElement $e The element to parse
827
	 * @param bool $is_backcompat Whether using backcompat parsing or not
828 829
	 * @return array A representation of the values contained within microformat $e
	 */
830
	public function parseH(\DOMElement $e, $is_backcompat = false) {
831
		// If it’s already been parsed (e.g. is a child mf), skip
832
		if ($this->parsed->contains($e)) {
833
			return null;
834
		}
835 836

		// Get current µf name
837
		$mfTypes = mfNamesFromElement($e, 'h-');
838 839 840 841

		// Initalise var to store the representation in
		$return = array();
		$children = array();
842
		$dates = array();
843

844 845
		// each rel-bookmark with an href attribute
		foreach ( $this->xpath->query('.//a[contains(concat(" ",normalize-space(@rel)," ")," bookmark ") and @href]', $e) as $el )
846
		{
847 848 849 850
			$class = 'u-url';
			// rel-bookmark already has class attribute; append current value
			if ($el->hasAttribute('class')) {
				$class .= ' ' . $el->getAttribute('class');
851
			}
852
			$el->setAttribute('class', $class);
853 854
		}

855 856
		$subMFs = $this->getRootMF($e);

857
		// Handle nested microformats (h-*)
858 859
		foreach ( $subMFs as $subMF ) {

860 861
			// Parse
			$result = $this->parseH($subMF);
862

863
			// If result was already parsed, skip it
864
			if (null === $result) {
865
				continue;
866 867
			}

868 869
			// In most cases, the value attribute of the nested microformat should be the p- parsed value of the elemnt.
			// The only times this is different is when the microformat is nested under certain prefixes, which are handled below.
870 871 872
			$result['value'] = $this->parseP($subMF);

			// Does this µf have any property names other than h-*?
873
			$properties = nestedMfPropertyNamesFromElement($subMF);
874

875 876
			if (!empty($properties)) {
				// Yes! It’s a nested property µf
877 878 879 880 881 882 883 884 885 886
				foreach ($properties as $property => $prefixes) {
					// Note: handling microformat nesting under multiple conflicting prefixes is not currently specified by the mf2 parsing spec.
					$prefixSpecificResult = $result;
					if (in_array('p-', $prefixes)) {
						$prefixSpecificResult['value'] = $prefixSpecificResult['properties']['name'][0];
					} elseif (in_array('e-', $prefixes)) {
						$eParsedResult = $this->parseE($subMF);
						$prefixSpecificResult['html'] = $eParsedResult['html'];
						$prefixSpecificResult['value'] = $eParsedResult['value'];
					} elseif (in_array('u-', $prefixes)) {
Gregor Morrill's avatar
Gregor Morrill committed
887
						$prefixSpecificResult['value'] = (empty($result['properties']['url'])) ? $this->parseU($subMF) : reset($result['properties']['url']);
888 889
					}
					$return[$property][] = $prefixSpecificResult;
890 891 892 893 894
				}
			} else {
				// No, it’s a child µf
				$children[] = $result;
			}
895

896 897 898 899 900 901 902 903
			// Make sure this sub-mf won’t get parsed as a µf or property
			// TODO: Determine if clearing this is required?
			$this->elementPrefixParsed($subMF, 'h');
			$this->elementPrefixParsed($subMF, 'p');
			$this->elementPrefixParsed($subMF, 'u');
			$this->elementPrefixParsed($subMF, 'dt');
			$this->elementPrefixParsed($subMF, 'e');
		}
904

905 906 907 908
		if($e->tagName == 'area') {
			$coords = $e->getAttribute('coords');
			$shape = $e->getAttribute('shape');
		}
909 910 911

		// Handle p-*
		foreach ($this->xpath->query('.//*[contains(concat(" ", @class) ," p-")]', $e) as $p) {
912
			if ($this->isElementParsed($p, 'p')) {
913
				continue;
914
			}
915 916

			$pValue = $this->parseP($p);
917

918
			// Add the value to the array for its p- properties
919
			foreach (mfNamesFromElement($p, 'p-') as $propName) {
920
				if (!empty($propName)) {
921
					$return[$propName][] = $pValue;
922
				}
923
			}
924

925 926 927 928 929
			// Make sure this sub-mf won’t get parsed as a top level mf
			$this->elementPrefixParsed($p, 'p');
		}

		// Handle u-*
930
		foreach ($this->xpath->query('.//*[contains(concat(" ",  @class)," u-")]', $e) as $u) {
931
			if ($this->isElementParsed($u, 'u')) {
932
				continue;
933
			}
934

935
			$uValue = $this->parseU($u);
936

937
			// Add the value to the array for its property types
938
			foreach (mfNamesFromElement($u, 'u-') as $propName) {
939 940
				$return[$propName][] = $uValue;
			}
941

942 943 944
			// Make sure this sub-mf won’t get parsed as a top level mf
			$this->elementPrefixParsed($u, 'u');
		}
945

946 947
		// Handle dt-*
		foreach ($this->xpath->query('.//*[contains(concat(" ", @class), " dt-")]', $e) as $dt) {
948
			if ($this->isElementParsed($dt, 'dt')) {
949
				continue;
950
			}
951

952
			$dtValue = $this->parseDT($dt, $dates);
953

954 955
			if ($dtValue) {
				// Add the value to the array for dt- properties
956
				foreach (mfNamesFromElement($dt, 'dt-') as $propName) {
957 958 959
					$return[$propName][] = $dtValue;
				}
			}
960

961 962 963 964
			// Make sure this sub-mf won’t get parsed as a top level mf
			$this->elementPrefixParsed($dt, 'dt');
		}

965
		// Handle e-*
966
		foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," e-")]', $e) as $em) {
967
			if ($this->isElementParsed($em, 'e')) {
968
				continue;
969
			}
970 971 972 973 974

			$eValue = $this->parseE($em);

			if ($eValue) {
				// Add the value to the array for e- properties
975
				foreach (mfNamesFromElement($em, 'e-') as $propName) {
976 977 978 979
					$return[$propName][] = $eValue;
				}
			}
			// Make sure this sub-mf won’t get parsed as a top level mf
980
			$this->elementPrefixParsed($em, 'e');
981 982
		}

983
		// Implied Properties
984
		// Check for p-name
985
		if (!array_key_exists('name', $return) && !$is_backcompat) {
986 987
			try {
				// Look for img @alt
988
				if (($e->tagName == 'img' or $e->tagName == 'area') and $e->getAttribute('alt') != '') {
989
					throw new Exception($e->getAttribute('alt'));
990
				}
991

992
				if ($e->tagName == 'abbr' and $e->hasAttribute('title')) {
993
					throw new Exception($e->getAttribute('title'));
994
				}
995

996 997
				// Look for nested img @alt
				foreach ($this->xpath->query('./img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
998 999
					$emNames = mfNamesFromElement($em, 'h-');
					if (empty($emNames) && $em->getAttribute('alt') != '') {
1000
						throw new Exception($em->getAttribute('alt'));
1001
					}
1002 1003
				}

1004 1005
				// Look for nested area @alt
				foreach ($this->xpath->query('./area[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
1006 1007
					$emNames = mfNamesFromElement($em, 'h-');
					if (empty($emNames) && $em->getAttribute('alt') != '') {
1008
						throw new Exception($em->getAttribute('alt'));
1009
					}
1010 1011
				}

1012 1013
				// Look for double nested img @alt
				foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
1014 1015
					$emNames = mfNamesFromElement($em, 'h-');
					if (empty($emNames) && $em->getAttribute('alt') != '') {
1016
						throw new Exception($em->getAttribute('alt'));
1017
					}
1018 1019
				}

1020 1021
				// Look for double nested area @alt
				foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/area[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
1022 1023
					$emNames = mfNamesFromElement($em, 'h-');
					if (empty($emNames) && $em->getAttribute('alt') != '') {
1024
						throw new Exception($em->getAttribute('alt'));
1025
					}
1026 1027
				}

1028
				throw new Exception($this->innerText($e, true));
1029
			} catch (Exception $exc) {
1030
				$return['name'][] = unicodeTrim($exc->getMessage());
1031 1032 1033 1034
			}
		}

		// Check for u-photo
1035
		if (!array_key_exists('photo', $return) && !$is_backcompat) {
1036

1037
			$photo = $this->parseImpliedPhoto($e);
1038

1039 1040
			if ($photo !== false) {
				$return['photo'][] = $this->resolveUrl($photo);
1041
			}
1042

1043 1044 1045
		}

		// Check for u-url
1046
		if (!array_key_exists('url', $return) && !$is_backcompat) {
1047
			// Look for a or area @href on the element itself
1048
			if ($e->tagName == 'a' or $e->tagName == 'area') {
1049
				$url = $e->getAttribute('href');
1050
			}
1051

1052
			// Look for nested a @href
1053
			foreach ($this->xpath->query('./a[count(preceding-sibling::a)+count(following-sibling::a)=0]', $e) as $em) {
1054 1055 1056 1057 1058
				$emNames = mfNamesFromElement($em, 'h-');
				if (empty($emNames)) {
					$url = $em->getAttribute('href');
					break;
				}
1059
			}
1060 1061 1062

			// Look for nested area @href
			foreach ($this->xpath->query('./area[count(preceding-sibling::area)+count(following-sibling::area)=0]', $e) as $em) {
1063 1064 1065 1066 1067
				$emNames = mfNamesFromElement($em, 'h-');
				if (empty($emNames)) {
					$url = $em->getAttribute('href');
					break;
				}
1068
			}
1069

1070
			if (!empty($url)) {
1071
				$return['url'][] = $this->resolveUrl($url);
1072
			}
1073 1074
		}

1075
		// Language
1076 1077 1078
		if ( $html_lang = $this->language($e) ) {
			$return['html-lang'] = $html_lang;
		}
1079

1080 1081
		// Make sure things are in alphabetical order
		sort($mfTypes);
1082

1083 1084 1085 1086 1087
		// Phew. Return the final result.
		$parsed = array(
			'type' => $mfTypes,
			'properties' => $return
		);
1088 1089

		if (!empty($shape)) {
1090
			$parsed['shape'] = $shape;
1091 1092 1093
		}

		if (!empty($coords)) {
1094
			$parsed['coords'] = $coords;
1095 1096 1097
		}

		if (!empty($children)) {
1098
			$parsed['children'] = array_values(array_filter($children));
1099
		}
1100 1101
		return $parsed;
	}
1102

1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146
	/**
	 * Parse Implied Photo
	 *
	 * Works out the implied photo of a microformat root element. The candidates are tried in this order, and
	 * the first one that yields a value wins:
	 *
	 * 1. the element itself, if it is an img with a src attribute
	 * 2. the element itself, if it is an object with a data attribute
	 * 3. a lone img child
	 * 4. a lone object child
	 * 5. a lone img inside an only-child element
	 * 6. a lone object inside an only-child element
	 *
	 * Nested candidates (3-6) are skipped when they carry h-* class names of their own.
	 *
	 * The value is returned exactly as written in the attribute; resolving it against the document URL is left
	 * to the caller.
	 *
	 * @see http://microformats.org/wiki/microformats2-parsing#parsing_for_implied_properties
	 *
	 * @param \DOMElement $e The microformat root element to inspect
	 * @return string|bool The unresolved photo URL, or false if there is no implied photo
	 */
	public function parseImpliedPhoto(\DOMElement $e) {

		// An img only implies a photo when it actually has a src attribute. Without this check an img lacking
		// src would return '' (not false), which callers treat as a real value.
		if ($e->tagName == 'img' && $e->hasAttribute('src')) {
			return $e->getAttribute('src');
		}

		if ($e->tagName == 'object' && $e->hasAttribute('data')) {
			return $e->getAttribute('data');
		}

		// Nested candidates, in priority order. The sibling-count predicate selects an element with no
		// element siblings, i.e. an only child.
		$xpaths = array(
			'./img',
			'./object',
			'./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img',
			'./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/object',
		);

		foreach ($xpaths as $path) {
			$els = $this->xpath->query($path, $e);

			// The candidate must be the only element of its kind at that level
			if ($els->length != 1) {
				continue;
			}

			$el = $els->item(0);
			$hClasses = mfNamesFromElement($el, 'h-');

			// Candidates with h-* class names are not used for the implied photo
			if (!empty($hClasses)) {
				continue;
			}

			// A candidate without a usable attribute does not end the search; later rules still get a chance
			if ($el->tagName == 'img' && $el->hasAttribute('src')) {
				return $el->getAttribute('src');
			}

			if ($el->tagName == 'object' && $el->getAttribute('data') != '') {
				return $el->getAttribute('data');
			}
		}

		// no implied photo
		return false;
	}

Barnaby Walters's avatar
Barnaby Walters committed
1147 1148
	/**
	 * Parse Rels and Alternatives
1149 1150
	 *
	 * Returns [$rels, $alternatives]. If the $rels value is to be empty, i.e. there are no links on the page