First version

felixgirault · Feb 5, 2015 · 2e1f632 · 2e1f632
commit 2e1f632
Show file tree

Hide file tree

Showing 6 changed files with 400 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+/vendor
diff --git a/README.md b/README.md
@@ -0,0 +1,68 @@
+Nokogiri
+========
+
+Cuts through XML like a breeze.
+
+Examples
+--------
+
+Given this XML:
+
+```xml
+<p>
+	<span>Lorem ipsum dolor <em>sit amet</em>.</span>
+</p>
+```
+
+Cutting it at the twentieth character...
+
+```php
+$Nokogiri = new Nokogiri\Nokogiri();
+$Nokogiri->cut($xml, 20);
+```
+
+Would return:
+
+```xml
+<p>
+	<span>Lorem ipsum dolor <em>si</em></span>
+</p>
+```
+
+Cutting it at the eleventh character...
+
+```php
+$Nokogiri->cut($xml, 11);
+```
+
+Would return:
+
+```xml
+<p>
+	<span>Lorem ipsum</span>
+</p>
+```
+
+Note that the blank characters between tags are not taken into account.
+
+Pitfalls
+--------
+
+For now, Nokogiri won't work properly with strings that are not enclosed
+in a proper tag, like this:
+
+```xml
+<p>
+	Some unenclosed string
+	<span>Lorem ipsum</span>
+</p>
+```
+
+Notes
+-----
+
+The implementation is probably shitty, as I don't know anything about writing a
+decent parser...
+
+Also, the implementation of the parser itself is kind of tied to the class using
+it. It is obviously bad but it works :grin:
diff --git a/composer.json b/composer.json
@@ -0,0 +1,23 @@
+{
+	"name": "fg/nokogiri",
+	"description": "Cuts through XML like a breeze.",
+	"version": "0.1.0",
+	"authors": [{
+		"name": "Félix Girault",
+		"email": "[email protected]",
+		"homepage": "http://www.felix-girault.fr",
+		"role": "Developer"
+	}],
+	"keywords": [
+		"xml",
+		"cut",
+		"truncate"
+	],
+	"license": "MIT",
+	"homepage": "http://github.com/felixgirault/essence",
+	"autoload": {
+		"psr-4": {
+			"Nokogiri\\": "src/"
+		}
+	}
+}
diff --git a/src/Nokogiri.php b/src/Nokogiri.php
@@ -0,0 +1,78 @@
+<?php
+
+namespace Nokogiri;
+
+use Nokogiri\Parser;
+
+
+
+/**
+ * Truncates an XML string after a given amount of text while closing every
+ * tag that is still open at the cut point.
+ *
+ * Lengths and offsets are computed with strlen() and substr(), so they are
+ * expressed in bytes.
+ */
+class Nokogiri {
+
+	/**
+	 * Cuts the given XML once $limit characters of text have been read.
+	 *
+	 * Text made only of whitespace (typically the indentation between tags)
+	 * is not counted.
+	 *
+	 * @param string $xml XML to cut.
+	 * @param int $limit Number of text characters to keep.
+	 * @return string The XML truncated at the cut position, followed by the
+	 *	closing tags of the elements that were open at that position, or
+	 *	$xml untouched when no cut position was found.
+	 */
+	public function cut($xml, $limit) {
+		$Parser = new Parser();
+		$opened = [];
+		$count = 0;
+		$position = 0;
+
+		// keeps track of the tags that are currently open
+		$Parser->on(
+			Parser::OPENED_TAG_EVENT,
+			function($tag) use (&$opened) {
+				$opened[] = $tag;
+			}
+		);
+
+		$Parser->on(
+			Parser::PARSED_TAG_CONTENTS_EVENT,
+			function($contents, $i) use (&$count, &$position, $limit) {
+				if ($this->_isWhitespace($contents)) {
+					return;
+				}
+
+				$count += strlen($contents);
+
+				if ($count >= $limit) {
+					// $i is the offset right after $contents: step back by
+					// the number of characters that exceed the limit
+					$position = $i - ($count - $limit);
+
+					// returning false stops the parser
+					return false;
+				}
+			}
+		);
+
+		$Parser->on(
+			Parser::CLOSED_TAG_EVENT,
+			function() use (&$opened) {
+				array_pop($opened);
+			}
+		);
+
+		$Parser->parse($xml);
+
+		return $position
+			? $this->_enclose($xml, $position, $opened)
+			: $xml;
+	}
+
+	/**
+	 * Tells if the given string is made only of whitespace characters.
+	 *
+	 * The pattern is anchored on purpose: a text like "Lorem ipsum" merely
+	 * contains whitespace and must still be counted by cut().
+	 *
+	 * @param string $string String to test.
+	 * @return bool True for a non-empty, whitespace-only string.
+	 */
+	protected function _isWhitespace($string) {
+		return preg_match('~\A\s+\z~', $string) === 1;
+	}
+
+	/**
+	 * Truncates the XML and closes the given tags.
+	 *
+	 * @param string $xml XML to truncate.
+	 * @param int $position Offset at which the XML is truncated.
+	 * @param array $tags Names of the open tags, outermost first.
+	 * @return string Truncated XML followed by the closing tags, innermost
+	 *	first.
+	 */
+	protected function _enclose($xml, $position, $tags) {
+		$xml = substr($xml, 0, $position);
+
+		// the innermost tag was opened last and must be closed first
+		foreach (array_reverse($tags) as $tag) {
+			$xml .= "</$tag>";
+		}
+
+		return $xml;
+	}
+}
diff --git a/src/Parser.php b/src/Parser.php
@@ -0,0 +1,180 @@
+<?php
+
+namespace Nokogiri;
+
+
+
+/**
+ * Minimal event-driven XML scanner.
+ *
+ * The input is read one character at a time through a small state machine.
+ * Observers registered with on() are notified when a tag is opened, when
+ * some text has been read and when a tag is closed.
+ *
+ * Comments, CDATA sections, processing instructions and '>' characters
+ * inside attribute values get no special treatment.
+ */
+class Parser {
+
+	// event: the name of an opening tag is known; receives the tag name
+	const OPENED_TAG_EVENT = 0;
+
+	// event: some text was read; receives the text and the offset right
+	// after it
+	const PARSED_TAG_CONTENTS_EVENT = 1;
+
+	// event: a tag was closed; receives no argument
+	const CLOSED_TAG_EVENT = 2;
+
+	// state: reading the name of a tag, right after a '<'
+	const PARSING_OPENING_TAG = 0;
+
+	// state: skipping the attributes of an opening tag
+	const PARSING_TAG_ATTRIBUTES = 1;
+
+	// state: reading the text between two tags
+	const PARSING_TAG_CONTENTS = 2;
+
+	// state: skipping the remainder of a closing tag
+	const PARSING_CLOSING_TAG = 3;
+
+	// callbacks, indexed by event
+	protected $_observers = [];
+
+	// set to false when an observer asks the parser to stop
+	protected $_continue = true;
+
+	// text read since the last tag
+	protected $_contents = '';
+
+	// name of the tag being read
+	protected $_tagName = '';
+
+	// current state of the parser
+	protected $_state = self::PARSING_TAG_CONTENTS;
+
+	// last character read while skipping attributes, used to detect the
+	// "/>" ending of a self-closing tag
+	protected $_previousChar = '';
+
+	/**
+	 * Registers the callback to notify for the given event.
+	 *
+	 * Only one callback is kept per event: registering another one for the
+	 * same event replaces the previous one.
+	 *
+	 * @param int $event One of the *_EVENT constants.
+	 * @param callable $callback Callback; returning false stops the parsing.
+	 */
+	public function on($event, callable $callback) {
+		$this->_observers[$event] = $callback;
+	}
+
+	/**
+	 * Notifies the observer of the given event, if any.
+	 *
+	 * Every argument following $event is passed to the callback. The
+	 * parsing stops when the callback returns exactly false.
+	 *
+	 * @param int $event One of the *_EVENT constants.
+	 */
+	protected function _emit($event) {
+		if (isset($this->_observers[$event])) {
+			$continue = call_user_func_array(
+				$this->_observers[$event],
+				array_slice(func_get_args(), 1)
+			);
+
+			$this->_continue = ($continue !== false);
+		}
+	}
+
+	/**
+	 * Parses the given XML, notifying observers along the way.
+	 *
+	 * The parsing stops early when an observer returns false.
+	 *
+	 * @param string $xml XML to parse.
+	 */
+	public function parse($xml) {
+		// start from a clean slate, otherwise an instance stopped by an
+		// observer could not be used again
+		$this->_continue = true;
+		$this->_contents = '';
+		$this->_tagName = '';
+		$this->_previousChar = '';
+		$this->_state = self::PARSING_TAG_CONTENTS;
+
+		$length = strlen($xml);
+
+		for ($i = 0; $i < $length; $i++) {
+			$char = $xml[$i];
+
+			switch ($this->_state) {
+				case self::PARSING_OPENING_TAG:
+					$this->_parseOpeningTag($char);
+					break;
+
+				case self::PARSING_TAG_ATTRIBUTES:
+					$this->_parseTagAttributes($char);
+					break;
+
+				case self::PARSING_TAG_CONTENTS:
+					$this->_parseTagContents($char, $i);
+					break;
+
+				case self::PARSING_CLOSING_TAG:
+					$this->_parseClosingTag($char);
+					break;
+			}
+
+			if (!$this->_continue) {
+				return;
+			}
+		}
+
+		// text following the last tag is not terminated by a '<', it has
+		// to be reported here
+		if (
+			$this->_state === self::PARSING_TAG_CONTENTS
+			&& $this->_contents !== ''
+		) {
+			$this->_emit(self::PARSED_TAG_CONTENTS_EVENT, $this->_contents, $length);
+			$this->_contents = '';
+		}
+	}
+
+	/**
+	 * Handles a character read right after a '<' or inside a tag name.
+	 *
+	 * @param string $char Current character.
+	 */
+	protected function _parseOpeningTag($char) {
+		switch ($char) {
+			case '/':
+				// a slash following a name ends a self-closing tag
+				// ("<br/>"): the tag is reported as opened, the closing
+				// event is emitted when the '>' is reached
+				if ($this->_tagName !== '') {
+					$this->_emit(self::OPENED_TAG_EVENT, $this->_tagName);
+					$this->_tagName = '';
+				}
+
+				// otherwise we're in fact parsing a closing tag
+				$this->_state = self::PARSING_CLOSING_TAG;
+				break;
+
+			// we're beginning to parse attributes
+			// we know the name of the tag we just opened
+			case ' ':
+			case "\t":
+			case "\n":
+			case "\r":
+				$this->_state = self::PARSING_TAG_ATTRIBUTES;
+				$this->_previousChar = '';
+				$this->_emit(self::OPENED_TAG_EVENT, $this->_tagName);
+				$this->_tagName = '';
+				break;
+
+			// we're reaching the end of an opening tag
+			// we know the name of the tag we just opened
+			// we can begin to store the tag's contents for later use
+			case '>':
+				$this->_state = self::PARSING_TAG_CONTENTS;
+				$this->_contents = '';
+				$this->_emit(self::OPENED_TAG_EVENT, $this->_tagName);
+				$this->_tagName = '';
+				break;
+
+			// we're storing the tag's name for later use
+			default:
+				$this->_tagName .= $char;
+				break;
+		}
+	}
+
+	/**
+	 * Skips the attributes of an opening tag until its '>' is reached.
+	 *
+	 * @param string $char Current character.
+	 */
+	protected function _parseTagAttributes($char) {
+		if ($char !== '>') {
+			$this->_previousChar = $char;
+			return;
+		}
+
+		// a tag ending with "/>" is self-closing ("<br />")
+		$selfClosing = ($this->_previousChar === '/');
+
+		// we're reaching the end of an opening tag
+		// we can begin to store the tag's contents for later use
+		$this->_previousChar = '';
+		$this->_state = self::PARSING_TAG_CONTENTS;
+		$this->_contents = '';
+
+		if ($selfClosing) {
+			$this->_emit(self::CLOSED_TAG_EVENT);
+		}
+	}
+
+	/**
+	 * Stores the text found between two tags.
+	 *
+	 * @param string $char Current character.
+	 * @param int $i Offset of the current character.
+	 */
+	protected function _parseTagContents($char, $i) {
+		switch ($char) {
+			// we're reaching the start of a tag
+			// we can begin to store the tag's name for later use
+			case '<':
+				$this->_state = self::PARSING_OPENING_TAG;
+				$this->_emit(self::PARSED_TAG_CONTENTS_EVENT, $this->_contents, $i);
+				$this->_contents = '';
+				break;
+
+			// we're storing the tag's contents for later use
+			default:
+				$this->_contents .= $char;
+				break;
+		}
+	}
+
+	/**
+	 * Skips the remainder of a closing tag until its '>' is reached.
+	 *
+	 * @param string $char Current character.
+	 */
+	protected function _parseClosingTag($char) {
+		switch ($char) {
+			// we're reaching the end of a closing tag
+			case '>':
+				$this->_state = self::PARSING_TAG_CONTENTS;
+				$this->_emit(self::CLOSED_TAG_EVENT);
+				break;
+		}
+	}
+}