@@ -37,10 +37,16 @@ private function _cleanText($text)
37
37
return preg_replace ('/\s+/ ' , ' ' , trim ($ text ));
38
38
}
39
39
40
- public function searchWord ($ word )
40
/**
 * Normalise a search word for use in the response payload.
 *
 * Steps, in order:
 *  1. drop every character that is not an ASCII letter, an ASCII digit or
 *     whitespace (punctuation and hyphens are removed, not replaced);
 *  2. trim leading/trailing whitespace and lower-case the result;
 *  3. collapse each run of whitespace into a single space.
 *
 * @param mixed $word Raw word as supplied by the caller; cast to string so
 *                    that null or numeric input does not trigger PHP 8.1+
 *                    deprecation notices in preg_replace()/trim().
 *
 * @return string The cleaned, lower-cased word (may be an empty string).
 */
private function _cleanWord($word)
{
    // Remove non-alphanumeric characters except whitespace.
    $alphanumericOnly = preg_replace('/[^a-zA-Z0-9\s]/', '', (string) $word);

    // Replace multiple whitespace characters with a single space.
    return preg_replace('/\s+/', ' ', strtolower(trim($alphanumericOnly)));
}
43
47
48
+ private function _parserV1 ($ htmlData , $ word )
49
+ {
44
50
$ doc = new DOMDocument ();
45
51
libxml_use_internal_errors (true );
46
52
$ doc ->loadHTML ($ htmlData );
@@ -79,11 +85,91 @@ public function searchWord($word)
79
85
}
80
86
81
87
$ dataResponse [$ i ] = [
88
+ 'word ' => $ word ,
82
89
'lema ' => $ lema ,
83
90
'arti ' => $ arti ,
84
91
'tesaurusLink ' => $ tesaurusLink ,
85
92
];
86
93
}
94
+ }
95
+
96
/**
 * Parse the page layout that wraps results in a "container body-content" div.
 *
 * For every <h2> (matched by its inline style "margin-bottom:3px") inside
 * that div, the parser extracts:
 *  - the lemma: text of the first <a> inside a span with class "rootword";
 *  - the thesaurus link: href of the first matching anchor, or a fallback
 *    URL built from $word when no such anchor exists;
 *  - the meanings: text of each <li> in the first following sibling
 *    <ul class="adjusted-par">.
 *
 * Entries lacking either a lemma or at least one meaning are skipped.
 *
 * @param string $htmlData Raw HTML of the result page.
 * @param string $word     Word to report in the entries and to use in the
 *                         fallback thesaurus URL (the visible caller passes
 *                         the cleaned word).
 *
 * @return array List of entries with the keys 'word', 'lema', 'arti' and
 *               'tesaurusLink'. An empty array is returned when the HTML is
 *               empty or the container div is missing, so callers can pass
 *               the result to count() without a TypeError on PHP 8.
 */
private function _parserV2($htmlData, $word)
{
    // DOMDocument::loadHTML() throws a ValueError for an empty string on
    // PHP 8, so bail out before touching the DOM when nothing was fetched.
    if (!is_string($htmlData) || trim($htmlData) === '') {
        return [];
    }

    $doc = new DOMDocument();

    // Silence libxml warnings for malformed HTML while loading, then put the
    // global libxml setting back the way it was found.
    $previousLibxmlSetting = libxml_use_internal_errors(true);
    $doc->loadHTML($htmlData);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);

    $xpath = new DOMXPath($doc);

    $contentDiv = $xpath->query("//div[contains(@class, 'container body-content')]")->item(0);
    if (!$contentDiv) {
        return [];
    }

    $dataResponse = [];

    // Collect every h2 element inside the body-content div.
    $h2Elements = $xpath->query(".//h2[contains(@style, 'margin-bottom:3px')]", $contentDiv);
    foreach ($h2Elements as $h2Element) {
        // Lemma: text of the link inside the "rootword" span.
        $lema = '';
        $lemaLink = $xpath->query(".//span[contains(@class, 'rootword')]/a", $h2Element)->item(0);
        if ($lemaLink) {
            $lema = $this->_cleanText($lemaLink->nodeValue);
        }

        // Thesaurus link: prefer the anchor found in the page; otherwise
        // build one from the word. The word may contain spaces, so it is
        // percent-encoded to keep the URL valid.
        $tesaurusAnchor = $xpath->query(".//p/a[contains(@href, 'tematis/lema')]", $h2Element)->item(0);
        if ($tesaurusAnchor) {
            $tesaurusLink = $tesaurusAnchor->getAttribute('href');
        } else {
            $tesaurusLink = "http://tesaurus.kemdikbud.go.id/tematis/lema/" . rawurlencode($word);
        }

        // Meanings: each <li> of the first <ul class="adjusted-par"> that
        // follows the h2 as a sibling.
        $arti = [];
        $ulElement = $xpath->query("following-sibling::ul[@class='adjusted-par'][1]", $h2Element)->item(0);
        if ($ulElement) {
            foreach ($xpath->query(".//li", $ulElement) as $listItem) {
                $arti[] = ['deskripsi' => $this->_cleanText($listItem->nodeValue)];
            }
        }

        // Only keep entries that have both a lemma and at least one meaning.
        if (empty($lema) || empty($arti)) {
            continue;
        }

        $dataResponse[] = [
            'word' => $word,
            'lema' => $lema . " » " . $word,
            'arti' => $arti,
            'tesaurusLink' => $tesaurusLink,
        ];
    }

    return $dataResponse;
}
154
+
155
/**
 * Look up a word and return its parsed dictionary entries.
 *
 * The HTML is fetched once with the word exactly as supplied by the caller,
 * then handed to each parser in turn (_parserV1 first, _parserV2 as the
 * fallback) together with the cleaned word. The first parser that yields a
 * non-empty array wins.
 *
 * @param string $word Word to search for.
 *
 * @return array|false Non-empty list of entries produced by the first
 *                     successful parser, or false when no parser found any.
 */
public function searchWord($word)
{
    // The cleaned word is only passed to the parsers; the fetch below
    // deliberately keeps using the caller's original input.
    // NOTE(review): confirm _fetchHtml() is meant to receive the raw word.
    $cleanWord = $this->_cleanWord($word);

    $htmlData = $this->_fetchHtml($word);

    // Parsers are tried in order of preference.
    foreach (['_parserV1', '_parserV2'] as $parserMethod) {
        $entries = $this->$parserMethod($htmlData, $cleanWord);

        // A parser may hand back null or false instead of an array; calling
        // count() on those throws a TypeError on PHP 8, so check the type
        // before counting.
        if (is_array($entries) && count($entries) > 0) {
            return $entries;
        }
    }

    return false;
}
0 commit comments