@@ -10,13 +10,13 @@ import (
10
10
"time"
11
11
12
12
"github.com/gocolly/colly"
13
- "github.com/gocolly/colly/debug"
14
13
)
15
14
16
15
// constants that do not change
17
16
const (
// Credentials of the account used for the DZone login.
// NOTE(review): the login POST further down appears to repeat these values
// as string literals instead of referencing these constants - confirm and unify.
18
17
username = "dzone-refcardz@mailcatch.com" // created just for this purpose
19
18
password = "password123456" // created just for this purpose
19
// baseURL is the assets-list URL up to and including "page="; the caller
// appends the page number followed by "&sort=downloads&type=refcard".
+ baseURL = "https://dzone.com/services/widget/assets-listV2/DEFAULT?hidefeat=true&page="
20
20
)
21
21
22
22
// RefcardzData is an exported title and pdf link
@@ -40,120 +40,79 @@ func main() { // main function start
40
40
os .Mkdir ("downloads" , 0700 )
41
41
42
42
// instantiate first collector for log in
43
- f := colly .NewCollector (colly .Debugger (& debug.LogDebugger {}))
44
-
45
- // instantiate second collector for assets-list
46
- g := colly .NewCollector ()
43
+ login := colly .NewCollector ()
47
44
48
45
// null the counters/pagers
49
- page := 0
50
46
counter := 1
47
+ stop := false
51
48
52
49
// authenticate, otherwise you won't be able to download refcardz later
53
- f .OnHTML ("form[role=form] input[type=hidden][name=TH_CSRF]" , func (e * colly.HTMLElement ) {
50
+ login .OnHTML ("form[role=form] input[type=hidden][name=TH_CSRF]" , func (e * colly.HTMLElement ) {
54
51
thCsrf := e .Attr ("value" )
55
- err := f .Post ("https://dzone.com/j_spring_security_check" , map [string ]string {"TH_CSRF" : thCsrf , "_spring_security_remember_me" : "true" , "j_username" : "dzone-refcardz@mailcatch.com" , "j_password" : "password123456" })
52
+ err := login .Post ("https://dzone.com/j_spring_security_check" , map [string ]string {"TH_CSRF" : thCsrf , "_spring_security_remember_me" : "true" , "j_username" : "dzone-refcardz@mailcatch.com" , "j_password" : "password123456" })
56
53
if err != nil {
57
54
log .Fatal (err )
58
55
}
59
56
return
60
57
})
61
58
62
59
// visit the users-login page
63
- f .Visit ("https://dzone.com/users/login.html" )
64
-
65
- // clone the logging collector
66
- h := f .Clone ()
67
- h .SetRequestTimeout (180 * time .Second )
68
-
69
- // keep visiting the assets-list websites until the specific response indicating last page is returned
70
- for {
60
+ login .Visit ("https://dzone.com/users/login.html" )
61
+ fmt .Println ("Logging in..." )
71
62
72
- // is there any error in the response?
73
- g .OnError (func (r * colly.Response , e error ) {
74
- log .Println ("Error:" , e , r .Request .URL , string (r .Body ))
75
- })
76
-
77
- // perform the following block on each assets-page response
78
- g .OnResponse (func (r * colly.Response ) {
63
+ // instantiate second collector for assets-list
64
+ assets := colly .NewCollector ()
79
65
80
- // unmarshal JSON based on the struct
81
- err := json .Unmarshal ([]byte (r .Body ), & data )
82
- if err != nil { // eventually error out
83
- log .Fatal (err )
84
- }
66
+ // clone the login collector for the actual downloads
67
+ downloader := login .Clone ()
68
+ downloader .SetRequestTimeout (180 * time .Second )
85
69
86
- // when the response is the last page, stop continuing with other pages and exit
87
- if string (r .Body ) == `{"success":true,"result":{"data":{"assets":[],"sort":"downloads"}},"status":200}` {
88
- fmt .Println ("Last page reached" )
89
- defer fmt .Println ("!" )
90
- os .Exit (0 )
91
- }
70
+ // is there any error in the assets-list response?
71
+ assets .OnError (func (r * colly.Response , e error ) {
72
+ log .Println ("Error:" , e , r .Request .URL , string (r .Body ))
73
+ })
92
74
93
- // naked return; returns the current values in the return arguments local variables
94
- //return
75
+ // perform the following block on each assets-list page
76
+ assets . OnResponse ( func ( r * colly. Response ) {
95
77
96
- }) // end of response block
78
+ // unmarshal JSON based on the struct
79
+ err := json .Unmarshal ([]byte (r .Body ), & data )
80
+ if err != nil { // eventually error out
81
+ log .Fatal (err )
82
+ }
97
83
98
84
// loop through the data assets containing Titles and incomplete Pdf links
99
85
for _ , obj := range data .Result .Data .Assets {
100
- fileName := strings .Replace (obj .Title + ".pdf" , " " , "_" , - 1 ) // title is going to be filename
86
+ fileName := strings .Replace (obj .Title + ".pdf" , " " , "_" , - 1 ) // the title becomes the filename: add the ".pdf" extension and replace every " " with "_"
101
87
link := "https://dzone.com" + obj .Pdf // complete the HTTP link
102
- fmt .Println ("link after assignment:" , link )
103
88
104
- h .OnResponse (func (q * colly.Response ) {
105
- // replace the " " with a "_" in the filename and add extension
106
- q .Save ("downloads/" + fileName ) // save
107
- fmt .Println ("#" , counter , "Downloaded" , fileName , "from" , link ) // show verbose progress
89
+ downloader .OnResponse (func (q * colly.Response ) {
90
+ q .Save ("downloads/" + fileName ) // save in the downloads directory
108
91
})
109
92
110
- // visit
111
- fmt .Println ("link before h.visit:" , link )
112
- h .Visit (link )
93
+ downloader .Visit (link )
94
+ fmt .Println ("#" , counter , "Downloaded" , fileName , "from" , link ) // show verbose progress
113
95
link = ""
96
+ fileName = ""
114
97
counter ++
115
98
}
116
- /*
117
- Debug log:
118
- [000001] 1 [ 1 - request] map["url":"https://dzone.com/users/login.html"] (36.392µs)
119
- [000002] 1 [ 1 - response] map["url":"https://dzone.com/users/login.html" "status":"OK"] (1.535579866s)
120
- [000003] 1 [ 1 - html] map["selector":"form[role=form] input[type=hidden][name=TH_CSRF]" "url":"https://dzone.com/users/login.html"] (1.537939626s)
121
- [000004] 1 [ 2 - request] map["url":"https://dzone.com/j_spring_security_check"] (1.538068512s)
122
- [000005] 1 [ 2 - response] map["url":"https://dzone.com/index.html" "status":"OK"] (2.565529343s)
123
- [000006] 1 [ 2 - scraped] map["url":"https://dzone.com/index.html"] (2.570756573s)
124
- [000007] 1 [ 1 - scraped] map["url":"https://dzone.com/users/login.html"] (2.570774088s)
125
- link after assignment: https://dzone.com/asset/download/280333
126
- link before h.visit: https://dzone.com/asset/download/280333
127
- [000008] 3 [ 1 - request] map["url":"https://dzone.com/asset/download/280333"] (3.377048525s)
128
- [000009] 3 [ 1 - response] map["url":"https://dzone.com/storage/assets/11325551-dzone-refcard288-gettingstartedwithgit0221.pdf" "status":"OK"] (11.466351159s)
129
- # 1 Downloaded Getting_Started_With_Git.pdf from https://dzone.com/asset/download/280333
130
- [000010] 3 [ 1 - scraped] map["url":"https://dzone.com/storage/assets/11325551-dzone-refcard288-gettingstartedwithgit0221.pdf"] (11.468748885s)
131
- link after assignment: https://dzone.com/asset/download/279342
132
- link before h.visit: https://dzone.com/asset/download/279342
133
- [000011] 3 [ 2 - request] map["url":"https://dzone.com/asset/download/279342"] (11.469000995s)
134
- [000012] 3 [ 2 - response] map["status":"OK" "url":"https://dzone.com/storage/assets/11283656-dzone-refcard288-timeseriesdata.pdf"] (1m15.373064768s)
135
- # 2 Downloaded Getting_Started_With_Git.pdf from
136
- # 2 Downloaded Working_With_Time_Series_Data.pdf from https://dzone.com/asset/download/279342
137
- [000013] 3 [ 2 - scraped] map["url":"https://dzone.com/storage/assets/11283656-dzone-refcard288-timeseriesdata.pdf"] (1m15.385235892s)
138
- link after assignment: https://dzone.com/asset/download/278339
139
- link before h.visit: https://dzone.com/asset/download/278339
140
- [000014] 3 [ 3 - request] map["url":"https://dzone.com/asset/download/278339"] (1m15.385308434s)
141
- [000015] 3 [ 3 - response] map["url":"https://dzone.com/storage/assets/11231342-dzone-refcard288-introtolowcode.pdf" "status":"OK"] (1m22.233645502s)
142
- # 3 Downloaded Getting_Started_With_Git.pdf from
143
- # 3 Downloaded Working_With_Time_Series_Data.pdf from
144
- # 3 Downloaded Low-Code_Application_Development.pdf from https://dzone.com/asset/download/278339
145
- [000016] 3 [ 3 - scraped] map["url":"https://dzone.com/storage/assets/11231342-dzone-refcard288-introtolowcode.pdf"] (1m22.257504381s)
146
- link after assignment: https://dzone.com/asset/download/279336
147
- link before h.visit: https://dzone.com/asset/download/279336
148
- [000017] 3 [ 4 - request] map["url":"https://dzone.com/asset/download/279336"] (1m22.25785153s)
149
- ^Csignal: interrupt
150
- */
151
- // next page
152
- page ++
153
-
154
- // visitassets-list website construction
155
- g .Visit ("https://dzone.com/services/widget/assets-listV2/DEFAULT?hidefeat=true&page=" + strconv .Itoa (page ) + "&sort=downloads&type=refcard" )
156
-
157
- } // end of the for loop
99
+
100
+ // when the response is the empty last page, set the stop flag so that no further pages are visited
101
+ if string (r .Body ) == `{"success":true,"result":{"data":{"assets":[],"sort":"downloads"}},"status":200}` {
102
+ defer fmt .Println ("Last page reached!" )
103
+ stop = true
104
+ return
105
+ }
106
+
107
+ }) // end of the assets-list response block
108
+
109
+ // visit assets-list until the last page is reached
110
+ for page := 1 ; page < 999 ; page ++ {
111
+ if stop {
112
+ break
113
+ } else {
114
+ assets .Visit (baseURL + strconv .Itoa (page ) + "&sort=downloads&type=refcard" )
115
+ }
116
+ }
158
117
159
118
} // end of the main function
0 commit comments