From 2e1f632e822b93e498486ebe2f30131d10ab964b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Girault?= Date: Fri, 6 Feb 2015 00:44:20 +0100 Subject: [PATCH] First version --- .gitignore | 1 + README.md | 68 ++++++++++++++++ composer.json | 23 ++++++ src/Nokogiri.php | 78 ++++++++++++++++++ src/Parser.php | 180 +++++++++++++++++++++++++++++++++++++++++ tests/NokogiriTest.php | 50 ++++++++++++ 6 files changed, 400 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 composer.json create mode 100644 src/Nokogiri.php create mode 100644 src/Parser.php create mode 100644 tests/NokogiriTest.php diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..61ead86 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/vendor diff --git a/README.md b/README.md new file mode 100644 index 0000000..0b83e1a --- /dev/null +++ b/README.md @@ -0,0 +1,68 @@ +Nokogiri +======== + +Cuts through XML like a breeze. + +Examples +-------- + +Given this XML: + +```xml +

+ Lorem ipsum dolor sit amet. +

+``` + +Cutting it at the twentieth character... + +```php +$Nokogiri = new Nokogiri\Nokogiri(); +$Nokogiri->cut($xml, 20); +``` + +Would return: + +```xml +

+ Lorem ipsum dolor sit. +

+``` + +Cutting it at the eleventh character... + +```php +$Nokogiri->cut($xml, 11); +``` + +Would return: + +```xml +

+ Lorem ipsum +

+``` + +Note that the blank characters between tags are not taken into account. + +Pitfalls +-------- + +For now, Nokogiri won't work properly with strings that are not enclosed +in a proper tag, like this: + +```xml +

+ Some unenclosed string + Lorem ipsum +

+``` + +Notes +----- + +The implementation is probably shitty, as I don't know anything about writing a +decent parser... + +Also, the implementation of the parser itself is kind of tied to the class using +it. It is obviously bad but it works :grin: diff --git a/composer.json b/composer.json new file mode 100644 index 0000000..0918955 --- /dev/null +++ b/composer.json @@ -0,0 +1,23 @@ +{ + "name": "fg/nokogiri", + "description": "Cuts through XML like a breeze.", + "version": "0.1.0", + "authors": [{ + "name": "Félix Girault", + "email": "felix.girault@gmail.com", + "homepage": "http://www.felix-girault.fr", + "role": "Developer" + }], + "keywords": [ + "xml", + "cut", + "truncate" + ], + "license": "MIT", + "homepage": "http://github.com/felixgirault/essence", + "autoload": { + "psr-4": { + "Nokogiri\\": "src/" + } + } +} diff --git a/src/Nokogiri.php b/src/Nokogiri.php new file mode 100644 index 0000000..63f0d6c --- /dev/null +++ b/src/Nokogiri.php @@ -0,0 +1,78 @@ +on( + Parser::OPENED_TAG_EVENT, + function($tag) use (&$opened) { + $opened[] = $tag; + } + ); + + $Parser->on( + Parser::PARSED_TAG_CONTENTS_EVENT, + function($contents, $i) use (&$opened, &$count, &$position, $limit) { + if ($this->_isWhitespace($contents)) { + return; + } + + $count += strlen($contents); + + if ($count >= $limit) { + $position = $i - ($count - $limit); + return false; + } + } + ); + + $Parser->on( + Parser::CLOSED_TAG_EVENT, + function() use (&$opened) { + array_pop($opened); + } + ); + + $Parser->parse($xml); + + return $position + ?
$this->_enclose($xml, $position, $opened) + : $xml; + } + + /** + * + */ + protected function _isWhitespace($string) { + return preg_match('~\s+~i', $string); + } + + /** + * + */ + protected function _enclose($xml, $position, $tags) { + $xml = substr($xml, 0, $position); + + while ($tag = array_pop($tags)) { + $xml .= ""; + } + + return $xml; + } +} diff --git a/src/Parser.php b/src/Parser.php new file mode 100644 index 0000000..d5bb86c --- /dev/null +++ b/src/Parser.php @@ -0,0 +1,180 @@ +_observers[$event] = $callback; + } + + /** + * + */ + protected function _emit($event) { + if (isset($this->_observers[$event])) { + $continue = call_user_func_array( + $this->_observers[$event], + array_slice(func_get_args(), 1) + ); + + $this->_continue = ($continue !== false); + } + } + + /** + * + */ + public function parse($xml) { + for ($i = 0; $i < strlen($xml); $i++) { + $char = $xml[$i]; + + switch ($this->_state) { + case self::PARSING_OPENING_TAG: + $this->_parseOpeningTag($char); + break; + + case self::PARSING_TAG_ATTRIBUTES: + $this->_parseTagAttributes($char); + break; + + case self::PARSING_TAG_CONTENTS: + $this->_parseTagContents($char, $i); + break; + + case self::PARSING_CLOSING_TAG: + $this->_parseClosingTag($char); + break; + } + + if (!$this->_continue) { + break; + } + } + } + + /** + * + */ + protected function _parseOpeningTag($char) { + switch ($char) { + // we're in fact parsing a closing tag + case '/': + $this->_state = self::PARSING_CLOSING_TAG; + break; + + // we're beggining to parse attributes + // we know the name of the tag we just opened + case ' ': + $this->_state = self::PARSING_TAG_ATTRIBUTES; + $this->_emit(self::OPENED_TAG_EVENT, $this->_tagName); + $this->_tagName = ''; + break; + + // we're reaching the end of an opening tag + // we know the name of the tag we just opened + // we can begin to store the tag's contents for later use + case '>': + $this->_state = self::PARSING_TAG_CONTENTS; + $this->_emit(self::OPENED_TAG_EVENT, 
$this->_tagName); + $this->_tagName = ''; + break; + + // we're storing the tag's name for later use + default: + $this->_tagName .= $char; + break; + } + } + + /** + * + */ + protected function _parseTagAttributes($char) { + switch ($char) { + // we're reaching the end of an opening tag + // we can begin to store the tag's contents for later use + case '>': + $this->_state = self::PARSING_TAG_CONTENTS; + $this->_contents = ''; + break; + } + } + + /** + * + */ + protected function _parseTagContents($char, $i) { + switch ($char) { + // we're reaching the start of a tag + // we can begin to store the tag's name for later use + case '<': + $this->_state = self::PARSING_OPENING_TAG; + $this->_emit(self::PARSED_TAG_CONTENTS_EVENT, $this->_contents, $i); + $this->_contents = ''; + break; + + // we're storing the tag's contents for later use + default: + $this->_contents .= $char; + break; + } + } + + /** + * + */ + protected function _parseClosingTag($char) { + switch ($char) { + // we're reaching the end of a closing tag + case '>': + $this->_state = self::PARSING_TAG_CONTENTS; + $this->_emit(self::CLOSED_TAG_EVENT); + break; + } + } +} diff --git a/tests/NokogiriTest.php b/tests/NokogiriTest.php new file mode 100644 index 0000000..346c2fa --- /dev/null +++ b/tests/NokogiriTest.php @@ -0,0 +1,50 @@ +Nokogiri = new Nokogiri\Nokogiri(); + } + + /** + * + */ + public function testCut() { + $xml = << + First + First + + First + First + +

+XML; + + $expected = << + First + First + + Fi

+XML; + + $this->assertEquals( + $expected, + $this->Nokogiri->cut($xml, 12) + ); + } +}