-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_type3.php
92 lines (79 loc) · 2.94 KB
/
parse_type3.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
<?php
// -*- coding: utf-8 -*-
/* parse_type3.php
* parse result page from named Type-3 platform
*
* released under GPLv3
*/
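/*
* Expected structure of the result page:
*
* <ul class="product_list">
* <li><span class="tit_img"><a href="{url}"><img src="{thumb}">
* <div class="book_con">
* <h4>{title}
* <p class="txt">{author} 저/ {publisher} / {pubdate}
* <div class="etc">{volume} 권
* <p class="txt_body">{description}
*/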
/**
 * Parse a Type-3 search-result HTML fragment into a <result> XML document.
 *
 * Every <li> directly under <ul class="product_list"> that contains a
 * span.tit_img link becomes one <book> element with these children, in order:
 * thumb, url, title, [author, publisher, pubdate], description, [volume],
 * support_mobile. The bracketed children are emitted only when the
 * corresponding text matches the expected pattern. List items without the
 * span.tit_img link are skipped.
 *
 * Missing sub-nodes (no <img>, no <h4>, no description, ...) are treated as
 * empty text instead of dereferencing a null node.
 *
 * @param string $html HTML fragment, expected to be UTF-8 (it is wrapped in a
 *                     page that declares charset=UTF-8 before parsing).
 * @return DOMDocument Document whose root element is <result>.
 */
function parse_type3($html)
{
    $odom = new DOMDocument("1.0", "UTF-8");
    $root = $odom->appendChild($odom->createElement("result"));

    // Wrap the fragment in a full page that declares UTF-8, so the HTML
    // parser does not have to guess the encoding of the fragment.
    $html2 = <<<EOD
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
</head>
<body>
$html
</body>
</html>
EOD;

    // Collect libxml parse warnings internally instead of silencing them
    // with '@', then restore whatever setting the caller had.
    $prevErrors = libxml_use_internal_errors(true);
    $doc = new DOMDocument;
    $doc->loadHTML($html2);
    libxml_clear_errors();
    libxml_use_internal_errors($prevErrors);

    $xpath = new DOMXpath($doc);
    $items = $xpath->query("//ul[@class='product_list']/li");
    foreach ($items as $item) {
        //=== (1): <span> holding the detail link and the thumbnail
        $links = $xpath->query("span[@class='tit_img']/a", $item);
        if ($links->length == 0) {
            // not a result entry (e.g. a "no result" placeholder)
            continue;
        }
        $link = $links->item(0);
        $book = $root->appendChild($odom->createElement("book"));

        // A link without an <img> yields an empty <thumb>.
        $imgs = $link->getElementsByTagName("img");
        $thumb = ($imgs->length > 0) ? $imgs->item(0)->getAttribute("src") : "";
        $node = $book->appendChild($odom->createElement("thumb"));
        $node->appendChild($odom->createTextNode($thumb));

        $node = $book->appendChild($odom->createElement("url"));
        $node->appendChild($odom->createTextNode($link->getAttribute("href")));

        //=== (2): <div> holding the textual details
        $con = "div[@class='book_con']";

        //-- {title}
        $title = parse_type3_node_text($xpath, $con."/h4", $item);
        $node = $book->appendChild($odom->createElement("title"));
        $node->appendChild($odom->createCDATASection(trim($title)));

        //-- {author} 저/ {publisher} / {pubdate}
        $info = parse_type3_node_text($xpath, $con."/p[@class='txt']", $item);
        $num = preg_match("#^\s*(.*\S) 저\s*/\s*(.*\S)\s*/\s*((\d{4})\w (\d{2})\w (\d{2})\w)#u", $info, $match);
        if ($num > 0) {
            $node = $book->appendChild($odom->createElement("author"));
            $node->appendChild($odom->createCDATASection($match[1]));
            $node = $book->appendChild($odom->createElement("publisher"));
            $node->appendChild($odom->createCDATASection($match[2]));
            // groups 4..6 are year, month, day
            $book->appendChild($odom->createElement("pubdate", "$match[4]-$match[5]-$match[6]"));
        }

        //-- {description}
        $desc = parse_type3_node_text($xpath, $con."/p[@class='txt_body']", $item);
        $node = $book->appendChild($odom->createElement("description"));
        $node->appendChild($odom->createCDATASection(trim($desc)));

        //-- {volume}권
        // Accept the marker at the very end of the text as well as before
        // whitespace; 'u' makes the pattern operate on UTF-8 characters.
        $etc = parse_type3_node_text($xpath, $con."/div[@class='etc']", $item);
        $num = preg_match("/(\d+)\s*권(?:\s|$)/u", $etc, $match);
        if ($num > 0) {
            $book->appendChild($odom->createElement("volume", $match[1]));
        }

        // no info about book format in search result page
        $book->appendChild($odom->createElement("support_mobile", "yes"));
    }
    return $odom;
}

/**
 * Return the text of the first node matching $expr relative to $context,
 * or an empty string when nothing matches.
 *
 * @param DOMXPath $xpath   XPath evaluator bound to the parsed page.
 * @param string   $expr    XPath expression, evaluated relative to $context.
 * @param DOMNode  $context Node the expression is evaluated against.
 * @return string
 */
function parse_type3_node_text($xpath, $expr, $context)
{
    $nodes = $xpath->query($expr, $context);
    if ($nodes === false || $nodes->length == 0) {
        return "";
    }
    return (string) $nodes->item(0)->nodeValue;
}
?>