Skip to content

Commit

Permalink
First version
Browse files Browse the repository at this point in the history
  • Loading branch information
felixgirault committed Feb 5, 2015
0 parents commit 2e1f632
Show file tree
Hide file tree
Showing 6 changed files with 400 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/vendor
68 changes: 68 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
Nokogiri
========

Cuts through XML like a breeze.

Examples
--------

Given this XML:

```xml
<p>
<span>Lorem ipsum dolor <em>sit amet</em>.</span>
</p>
```

Cutting it at the twentieth character...

```php
$Nokogiri = new Nokogiri\Nokogiri();
$Nokogiri->cut($xml, 20);
```

Would return:

```xml
<p>
<span>Lorem ipsum dolor <em>sit</em>.</span>
</p>
```

Cutting it at the eleventh character...

```php
$Nokogiri->cut($xml, 11);
```

Would return:

```xml
<p>
<span>Lorem ipsum</span>
</p>
```

Note that the blank characters between tags are not taken into account.

Pitfalls
--------

For now, Nokogiri won't work properly with strings that are not enclosed
in a proper tag, like this:

```xml
<p>
Some unenclosed string
<span>Lorem ipsum</span>
</p>
```

Notes
-----

The implementation is probably shitty, as I don't know anything about writing a
decent parser...

Also, the implementation of the parser itself is kind of tied to the class using
it. It is obviously bad but it works :grin:
23 changes: 23 additions & 0 deletions composer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"name": "fg/nokogiri",
"description": "Cuts through XML like a breeze.",
"version": "0.1.0",
"authors": [{
"name": "Félix Girault",
"email": "[email protected]",
"homepage": "http://www.felix-girault.fr",
"role": "Developer"
}],
"keywords": [
"xml",
"cut",
"truncate"
],
"license": "MIT",
"homepage": "http://github.com/felixgirault/essence",
"autoload": {
"psr-4": {
"Nokogiri\\": "src/"
}
}
}
78 changes: 78 additions & 0 deletions src/Nokogiri.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
<?php

namespace Nokogiri;

use Nokogiri\Parser;



/**
 * Truncates an XML string after a given amount of text, then re-closes the
 * tags that were still open at the cut point so the markup stays balanced.
 *
 * Lengths are measured with strlen(), i.e. in bytes.
 */
class Nokogiri {

    /**
     * Cuts $xml once $limit bytes of non-blank text content have been read.
     *
     * Text runs made only of whitespace (typically the indentation between
     * tags) are not counted. If the limit is never reached, $xml is returned
     * untouched.
     *
     * @param string $xml Markup to truncate.
     * @param int $limit Maximum amount of text to keep; negative values are
     *     treated as 0.
     * @return string The truncated markup, or $xml itself when no cut was needed.
     */
    public function cut($xml, $limit) {
        $Parser = new Parser();
        $limit = max(0, $limit);
        $opened = [];
        $count = 0;

        // null means "limit never reached"; 0 is a legitimate cut offset
        $position = null;

        $Parser->on(
            Parser::OPENED_TAG_EVENT,
            function($tag) use (&$opened) {
                $opened[] = $tag;
            }
        );

        $Parser->on(
            Parser::PARSED_TAG_CONTENTS_EVENT,
            function($contents, $i) use (&$count, &$position, $limit) {
                if ($this->_isWhitespace($contents)) {
                    return;
                }

                $count += strlen($contents);

                if ($count >= $limit) {
                    // $i is the offset of the '<' that ends this text run,
                    // so we step back by the amount that exceeds the limit
                    $position = $i - ($count - $limit);

                    // returning false tells the parser to stop
                    return false;
                }
            }
        );

        $Parser->on(
            Parser::CLOSED_TAG_EVENT,
            function() use (&$opened) {
                array_pop($opened);
            }
        );

        $Parser->parse($xml);

        return ($position === null)
            ? $xml
            : $this->_enclose($xml, $position, $opened);
    }

    /**
     * Tells if a string is empty or made only of whitespace characters.
     *
     * The pattern is anchored on both ends: a string that merely contains
     * some whitespace, like "Lorem ipsum", is not blank.
     *
     * @param string $string
     * @return bool
     */
    protected function _isWhitespace($string) {
        return preg_match('~\A\s*\z~', $string) === 1;
    }

    /**
     * Keeps the first $position bytes of $xml and appends a closing tag for
     * each name in $tags, innermost (last opened) first.
     *
     * @param string $xml
     * @param int $position Offset at which the markup is cut.
     * @param array $tags Names of the tags still open, in opening order.
     * @return string
     */
    protected function _enclose($xml, $position, $tags) {
        $xml = substr($xml, 0, $position);

        // iterating instead of popping until a falsy value, so that a tag
        // name such as '0' doesn't end the loop early
        foreach (array_reverse($tags) as $tag) {
            $xml .= "</$tag>";
        }

        return $xml;
    }
}
180 changes: 180 additions & 0 deletions src/Parser.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
<?php

namespace Nokogiri;



/**
 * Minimal event-driven scanner that walks an XML string one byte at a time.
 *
 * Observers registered with on() are notified when the name of an opening
 * tag is known, when a run of text between two tags ends, and when a tag is
 * closed. An observer can halt the scan by returning false.
 *
 * Comments, CDATA sections, processing instructions and doctype declarations
 * get no special treatment: whatever follows a '<' is handled as a tag.
 * Text located after the last tag of the input is never reported, as contents
 * are only emitted when the next '<' is met.
 */
class Parser {

    // Event: the name of an opening tag has been read.
    // Observers receive the tag name.
    const OPENED_TAG_EVENT = 0;

    // Event: the text preceding a '<' has been read.
    // Observers receive the text and the offset of that '<' in the input.
    const PARSED_TAG_CONTENTS_EVENT = 1;

    // Event: a tag has been closed. Observers receive no argument.
    const CLOSED_TAG_EVENT = 2;

    // State: reading what follows a '<' (a tag name or the '/' of a closing tag).
    const PARSING_OPENING_TAG = 0;

    // State: skipping over the attributes of an opening tag.
    const PARSING_TAG_ATTRIBUTES = 1;

    // State: accumulating the text located between two tags.
    const PARSING_TAG_CONTENTS = 2;

    // State: skipping to the '>' that ends a closing tag.
    const PARSING_CLOSING_TAG = 3;

    // Registered callbacks, indexed by event (one callback per event).
    protected $_observers = [];

    // Set to false when an observer asks the scan to stop.
    protected $_continue = true;

    // Text accumulated since the last tag.
    protected $_contents = '';

    // Name of the tag being read.
    protected $_tagName = '';

    // Current state of the scanner, one of the PARSING_* constants.
    protected $_state = self::PARSING_TAG_CONTENTS;

    // While skipping attributes: whether the last byte seen was a '/',
    // which makes the upcoming '>' the end of a self-closing tag.
    protected $_selfClosing = false;

    /**
     * Registers the callback to notify for an event.
     *
     * Registering a second callback for the same event replaces the first.
     *
     * @param int $event One of the *_EVENT constants.
     * @param callable $callback Return false from it to stop the scan.
     */
    public function on($event, callable $callback) {
        $this->_observers[$event] = $callback;
    }

    /**
     * Notifies the observer of an event, if any, forwarding to it every
     * argument given after $event.
     *
     * @param int $event One of the *_EVENT constants.
     */
    protected function _emit($event) {
        if (isset($this->_observers[$event])) {
            $continue = call_user_func_array(
                $this->_observers[$event],
                array_slice(func_get_args(), 1)
            );

            // only an explicit false stops the scan, a callback returning
            // nothing lets it go on
            $this->_continue = ($continue !== false);
        }
    }

    /**
     * Scans a string and notifies the observers along the way.
     *
     * @param string $xml
     */
    public function parse($xml) {
        // starting from a clean slate, so that an instance can be reused
        // even if a previous scan was stopped half-way by an observer
        $this->_continue = true;
        $this->_contents = '';
        $this->_tagName = '';
        $this->_selfClosing = false;
        $this->_state = self::PARSING_TAG_CONTENTS;

        $length = strlen($xml);

        for ($i = 0; $i < $length; $i++) {
            $char = $xml[$i];

            switch ($this->_state) {
                case self::PARSING_OPENING_TAG:
                    $this->_parseOpeningTag($char);
                    break;

                case self::PARSING_TAG_ATTRIBUTES:
                    $this->_parseTagAttributes($char);
                    break;

                case self::PARSING_TAG_CONTENTS:
                    $this->_parseTagContents($char, $i);
                    break;

                case self::PARSING_CLOSING_TAG:
                    $this->_parseClosingTag($char);
                    break;
            }

            if (!$this->_continue) {
                break;
            }
        }
    }

    /**
     * Handles a byte located right after a '<' or inside a tag name.
     *
     * @param string $char
     */
    protected function _parseOpeningTag($char) {
        switch ($char) {
            case '/':
                // a slash following a name ("<br/>") ends a self-closing
                // tag: it is reported as opened here, and as closed when
                // its '>' is met
                if ($this->_tagName !== '') {
                    $this->_emit(self::OPENED_TAG_EVENT, $this->_tagName);
                    $this->_tagName = '';
                }

                // otherwise we're in fact parsing a closing tag ("</p>")
                $this->_state = self::PARSING_CLOSING_TAG;
                break;

            // we're beginning to parse attributes
            // we know the name of the tag we just opened
            case ' ':
            case "\t":
            case "\n":
            case "\r":
                $this->_state = self::PARSING_TAG_ATTRIBUTES;
                $this->_selfClosing = false;
                $this->_emit(self::OPENED_TAG_EVENT, $this->_tagName);
                $this->_tagName = '';
                break;

            // we're reaching the end of an opening tag
            // we know the name of the tag we just opened
            // we can begin to store the tag's contents for later use
            case '>':
                $this->_state = self::PARSING_TAG_CONTENTS;
                $this->_contents = '';
                $this->_emit(self::OPENED_TAG_EVENT, $this->_tagName);
                $this->_tagName = '';
                break;

            // we're storing the tag's name for later use
            default:
                $this->_tagName .= $char;
                break;
        }
    }

    /**
     * Handles a byte located in the attributes part of an opening tag.
     *
     * @param string $char
     */
    protected function _parseTagAttributes($char) {
        if ($char !== '>') {
            // remembering whether the tag might be ending with "/>"
            $this->_selfClosing = ($char === '/');
            return;
        }

        // we're reaching the end of an opening tag
        // we can begin to store the tag's contents for later use
        $this->_state = self::PARSING_TAG_CONTENTS;
        $this->_contents = '';

        // "<br />" was reported as opened when its name was read, it must
        // be reported as closed too
        if ($this->_selfClosing) {
            $this->_selfClosing = false;
            $this->_emit(self::CLOSED_TAG_EVENT);
        }
    }

    /**
     * Handles a byte located between two tags.
     *
     * @param string $char
     * @param int $i Offset of $char in the parsed string.
     */
    protected function _parseTagContents($char, $i) {
        switch ($char) {
            // we're reaching the start of a tag
            // we can begin to store the tag's name for later use
            case '<':
                $this->_state = self::PARSING_OPENING_TAG;
                $this->_emit(self::PARSED_TAG_CONTENTS_EVENT, $this->_contents, $i);
                $this->_contents = '';
                break;

            // we're storing the tag's contents for later use
            default:
                $this->_contents .= $char;
                break;
        }
    }

    /**
     * Handles a byte located inside a closing tag; the tag's name is ignored.
     *
     * @param string $char
     */
    protected function _parseClosingTag($char) {
        switch ($char) {
            // we're reaching the end of a closing tag
            case '>':
                $this->_state = self::PARSING_TAG_CONTENTS;
                $this->_emit(self::CLOSED_TAG_EVENT);
                break;
        }
    }
}
Loading

0 comments on commit 2e1f632

Please sign in to comment.