Skip to content

Commit 1abb9fc

Browse files
authored
Update KBBIModel.php
1 parent 6d1fc56 commit 1abb9fc

File tree

1 file changed

+88
-2
lines changed

1 file changed

+88
-2
lines changed

KBBIModel.php

Lines changed: 88 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,16 @@ private function _cleanText($text)
3737
return preg_replace('/\s+/', ' ', trim($text));
3838
}
3939

40-
public function searchWord($word)
40+
/**
 * Normalises a search term: strips punctuation/symbols, lower-cases it and
 * collapses runs of whitespace into single spaces.
 *
 * Letters, combining marks and digits from any script are kept, so accented
 * characters are no longer dropped from the word. Hyphens and other
 * punctuation are still removed.
 *
 * @param string $word Raw word as supplied by the caller.
 * @return string Lower-cased word with single spaces and no surrounding whitespace.
 */
private function _cleanWord($word)
{
    $word = (string) $word;

    // Unicode-aware pass: keep letters, marks, digits and whitespace only.
    $stripped = preg_replace('/[^\p{L}\p{M}\p{N}\s]/u', '', $word);

    if ($stripped === null) {
        // With the /u modifier PCRE rejects malformed UTF-8 and preg_replace()
        // returns null; fall back to the byte-wise ASCII filter in that case.
        $stripped = preg_replace('/[^a-zA-Z0-9\s]/', '', $word);

        return trim(preg_replace('/\s+/', ' ', strtolower($stripped)));
    }

    // mbstring is an optional extension; strtolower() only folds ASCII safely.
    $lowered = function_exists('mb_strtolower')
        ? mb_strtolower($stripped, 'UTF-8')
        : strtolower($stripped);

    // Collapse first, then trim, so whitespace that trim() does not know about
    // (e.g. form feed) cannot survive as a leading/trailing space.
    return trim(preg_replace('/\s+/u', ' ', $lowered));
}
4347

48+
private function _parserV1($htmlData, $word)
49+
{
4450
$doc = new DOMDocument();
4551
libxml_use_internal_errors(true);
4652
$doc->loadHTML($htmlData);
@@ -79,11 +85,91 @@ public function searchWord($word)
7985
}
8086

8187
$dataResponse[$i] = [
88+
'word' => $word,
8289
'lema' => $lema,
8390
'arti' => $arti,
8491
'tesaurusLink' => $tesaurusLink,
8592
];
8693
}
94+
}
95+
96+
/**
 * Parses the page layout in which each entry is an <h2> heading followed by
 * a <ul class="adjusted-par"> list of definitions.
 *
 * Only headings that yield both a lemma and at least one definition are
 * included in the result.
 *
 * @param string $htmlData HTML document to parse.
 * @param string $word     Word stored in each entry and used for the fallback thesaurus link.
 * @return array List of entries with keys 'word', 'lema', 'arti' and 'tesaurusLink';
 *               empty when the content container or matching entries are missing.
 */
private function _parserV2($htmlData, $word)
{
    $doc = new DOMDocument();
    // Collect libxml parse warnings internally instead of emitting them.
    libxml_use_internal_errors(true);
    $doc->loadHTML($htmlData);
    libxml_clear_errors();

    $xpath = new DOMXPath($doc);
    $dataResponse = [];

    $contentDiv = $xpath->query("//div[contains(@class, 'container body-content')]")->item(0);
    if (!$contentDiv) {
        // Return an empty list rather than false so the result is always
        // countable by the caller.
        return $dataResponse;
    }

    // Every entry heading inside the body-content container.
    $h2Elements = $xpath->query(".//h2[contains(@style, 'margin-bottom:3px')]", $contentDiv);
    foreach ($h2Elements as $h2Element) {
        // The lemma is the text of the link inside the "rootword" span.
        $lema = '';
        $lemaLink = $xpath->query(".//span[contains(@class, 'rootword')]/a", $h2Element)->item(0);
        if ($lemaLink) {
            $lema = $this->_cleanText($lemaLink->nodeValue);
        }

        // Thesaurus link: use the one on the page, otherwise build it from the word.
        $tesaurusAnchor = $xpath->query(".//p/a[contains(@href, 'tematis/lema')]", $h2Element)->item(0);
        if ($tesaurusAnchor) {
            $tesaurusLink = $tesaurusAnchor->getAttribute('href');
        } else {
            // Encode the word so multi-word terms do not put raw spaces in the URL.
            $tesaurusLink = "http://tesaurus.kemdikbud.go.id/tematis/lema/" . rawurlencode($word);
        }

        // Definitions come from the first matching <ul> sibling after the heading.
        $arti = [];
        $ulElement = $xpath->query("following-sibling::ul[@class='adjusted-par'][1]", $h2Element)->item(0);
        if ($ulElement) {
            foreach ($xpath->query(".//li", $ulElement) as $listItem) {
                $arti[] = ['deskripsi' => $this->_cleanText($listItem->nodeValue)];
            }
        }

        // Keep the entry only when both a lemma and definitions were found.
        if (!empty($lema) && !empty($arti)) {
            $dataResponse[] = [
                'word' => $word,
                'lema' => $lema . " » " . $word,
                'arti' => $arti,
                'tesaurusLink' => $tesaurusLink,
            ];
        }
    }

    return $dataResponse;
}
154+
155+
/**
 * Looks up a word and returns the parsed dictionary entries.
 *
 * The raw $word is passed to _fetchHtml(); the normalised form produced by
 * _cleanWord() is what the parsers receive. _parserV1() is tried first and
 * _parserV2() is used only when V1 yields no entries.
 *
 * @param string $word Word as supplied by the caller.
 * @return array|false Non-empty array of entries, or false when nothing was parsed.
 */
public function searchWord($word)
{
    $cleanWord = $this->_cleanWord($word);
    $htmlData = $this->_fetchHtml($word);

    // Both parsers feed $htmlData to DOMDocument::loadHTML(), which rejects an
    // empty document; with nothing fetched there is nothing to parse.
    if ($htmlData === false || $htmlData === null || $htmlData === '') {
        return false;
    }

    // A parser may hand back null or false instead of an array. count() on
    // those raises a TypeError on PHP 8, so check the type before counting.
    $entries = $this->_parserV1($htmlData, $cleanWord);
    if (!is_array($entries) || count($entries) === 0) {
        $entries = $this->_parserV2($htmlData, $cleanWord);
    }

    return (is_array($entries) && count($entries) > 0) ? $entries : false;
}

0 commit comments

Comments
 (0)