Skip to content

Commit 8b622e9

Browse files
committed
Removed outer-loop and downloading works
1 parent 9a219c0 commit 8b622e9

File tree

1 file changed

+47
-88
lines changed

1 file changed

+47
-88
lines changed

main.go

Lines changed: 47 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@ import (
1010
"time"
1111

1212
"github.com/gocolly/colly"
13-
"github.com/gocolly/colly/debug"
1413
)
1514

1615
// constants that do not change
1716
const (
1817
username = "dzone-refcardz@mailcatch.com" // created just for this purpose
1918
password = "password123456" // created just for this purpose
19+
baseURL = "https://dzone.com/services/widget/assets-listV2/DEFAULT?hidefeat=true&page="
2020
)
2121

2222
// RefcardzData is an exported title and pdf link
@@ -40,120 +40,79 @@ func main() { // main function start
4040
os.Mkdir("downloads", 0700)
4141

4242
// instantiate first collector for log in
43-
f := colly.NewCollector(colly.Debugger(&debug.LogDebugger{}))
44-
45-
// instantiate second collector for assets-list
46-
g := colly.NewCollector()
43+
login := colly.NewCollector()
4744

4845
// initialize the counters/pagers
49-
page := 0
5046
counter := 1
47+
stop := false
5148

5249
// authenticate, otherwise you won't be able to download refcardz later
53-
f.OnHTML("form[role=form] input[type=hidden][name=TH_CSRF]", func(e *colly.HTMLElement) {
50+
login.OnHTML("form[role=form] input[type=hidden][name=TH_CSRF]", func(e *colly.HTMLElement) {
5451
thCsrf := e.Attr("value")
55-
err := f.Post("https://dzone.com/j_spring_security_check", map[string]string{"TH_CSRF": thCsrf, "_spring_security_remember_me": "true", "j_username": "dzone-refcardz@mailcatch.com", "j_password": "password123456"})
52+
err := login.Post("https://dzone.com/j_spring_security_check", map[string]string{"TH_CSRF": thCsrf, "_spring_security_remember_me": "true", "j_username": "dzone-refcardz@mailcatch.com", "j_password": "password123456"})
5653
if err != nil {
5754
log.Fatal(err)
5855
}
5956
return
6057
})
6158

6259
// visit the users-login page
63-
f.Visit("https://dzone.com/users/login.html")
64-
65-
// clone the logging collector
66-
h := f.Clone()
67-
h.SetRequestTimeout(180 * time.Second)
68-
69-
// keep visiting the assets-list websites until the specific response indicating last page is returned
70-
for {
60+
login.Visit("https://dzone.com/users/login.html")
61+
fmt.Println("Logging in...")
7162

72-
// is there any error in the response?
73-
g.OnError(func(r *colly.Response, e error) {
74-
log.Println("Error:", e, r.Request.URL, string(r.Body))
75-
})
76-
77-
// perform the following block on each assets-page response
78-
g.OnResponse(func(r *colly.Response) {
63+
// instantiate second collector for assets-list
64+
assets := colly.NewCollector()
7965

80-
// unmarshal JSON based on the struct
81-
err := json.Unmarshal([]byte(r.Body), &data)
82-
if err != nil { // eventually error out
83-
log.Fatal(err)
84-
}
66+
// clone the login collector for the actual downloads
67+
downloader := login.Clone()
68+
downloader.SetRequestTimeout(180 * time.Second)
8569

86-
// when the response is the last page, stop continuing with other pages and exit
87-
if string(r.Body) == `{"success":true,"result":{"data":{"assets":[],"sort":"downloads"}},"status":200}` {
88-
fmt.Println("Last page reached")
89-
defer fmt.Println("!")
90-
os.Exit(0)
91-
}
70+
// is there any error in the assets-list response?
71+
assets.OnError(func(r *colly.Response, e error) {
72+
log.Println("Error:", e, r.Request.URL, string(r.Body))
73+
})
9274

93-
// naked return; returns the current values in the return arguments local variables
94-
//return
75+
// perform the following block on each assets-list page
76+
assets.OnResponse(func(r *colly.Response) {
9577

96-
}) // end of response block
78+
// unmarshal JSON based on the struct
79+
err := json.Unmarshal([]byte(r.Body), &data)
80+
if err != nil { // eventually error out
81+
log.Fatal(err)
82+
}
9783

9884
// loop through the data assets containing Titles and incomplete Pdf links
9985
for _, obj := range data.Result.Data.Assets {
100-
fileName := strings.Replace(obj.Title+".pdf", " ", "_", -1) // title is going to be filename
86+
fileName := strings.Replace(obj.Title+".pdf", " ", "_", -1) // title is going to be the filename; replace the " " with a "_" in the filename and add extension
10187
link := "https://dzone.com" + obj.Pdf // complete the HTTP link
102-
fmt.Println("link after assignment:", link)
10388

104-
h.OnResponse(func(q *colly.Response) {
105-
// replace the " " with a "_" in the filename and add extension
106-
q.Save("downloads/" + fileName) // save
107-
fmt.Println("#", counter, "Downloaded", fileName, "from", link) // show verbose progress
89+
downloader.OnResponse(func(q *colly.Response) {
90+
q.Save("downloads/" + fileName) // save in the downloads directory
10891
})
10992

110-
// visit
111-
fmt.Println("link before h.visit:", link)
112-
h.Visit(link)
93+
downloader.Visit(link)
94+
fmt.Println("#", counter, "Downloaded", fileName, "from", link) // show verbose progress
11395
link = ""
96+
fileName = ""
11497
counter++
11598
}
116-
/*
117-
Debug log:
118-
[000001] 1 [ 1 - request] map["url":"https://dzone.com/users/login.html"] (36.392µs)
119-
[000002] 1 [ 1 - response] map["url":"https://dzone.com/users/login.html" "status":"OK"] (1.535579866s)
120-
[000003] 1 [ 1 - html] map["selector":"form[role=form] input[type=hidden][name=TH_CSRF]" "url":"https://dzone.com/users/login.html"] (1.537939626s)
121-
[000004] 1 [ 2 - request] map["url":"https://dzone.com/j_spring_security_check"] (1.538068512s)
122-
[000005] 1 [ 2 - response] map["url":"https://dzone.com/index.html" "status":"OK"] (2.565529343s)
123-
[000006] 1 [ 2 - scraped] map["url":"https://dzone.com/index.html"] (2.570756573s)
124-
[000007] 1 [ 1 - scraped] map["url":"https://dzone.com/users/login.html"] (2.570774088s)
125-
link after assignment: https://dzone.com/asset/download/280333
126-
link before h.visit: https://dzone.com/asset/download/280333
127-
[000008] 3 [ 1 - request] map["url":"https://dzone.com/asset/download/280333"] (3.377048525s)
128-
[000009] 3 [ 1 - response] map["url":"https://dzone.com/storage/assets/11325551-dzone-refcard288-gettingstartedwithgit0221.pdf" "status":"OK"] (11.466351159s)
129-
# 1 Downloaded Getting_Started_With_Git.pdf from https://dzone.com/asset/download/280333
130-
[000010] 3 [ 1 - scraped] map["url":"https://dzone.com/storage/assets/11325551-dzone-refcard288-gettingstartedwithgit0221.pdf"] (11.468748885s)
131-
link after assignment: https://dzone.com/asset/download/279342
132-
link before h.visit: https://dzone.com/asset/download/279342
133-
[000011] 3 [ 2 - request] map["url":"https://dzone.com/asset/download/279342"] (11.469000995s)
134-
[000012] 3 [ 2 - response] map["status":"OK" "url":"https://dzone.com/storage/assets/11283656-dzone-refcard288-timeseriesdata.pdf"] (1m15.373064768s)
135-
# 2 Downloaded Getting_Started_With_Git.pdf from
136-
# 2 Downloaded Working_With_Time_Series_Data.pdf from https://dzone.com/asset/download/279342
137-
[000013] 3 [ 2 - scraped] map["url":"https://dzone.com/storage/assets/11283656-dzone-refcard288-timeseriesdata.pdf"] (1m15.385235892s)
138-
link after assignment: https://dzone.com/asset/download/278339
139-
link before h.visit: https://dzone.com/asset/download/278339
140-
[000014] 3 [ 3 - request] map["url":"https://dzone.com/asset/download/278339"] (1m15.385308434s)
141-
[000015] 3 [ 3 - response] map["url":"https://dzone.com/storage/assets/11231342-dzone-refcard288-introtolowcode.pdf" "status":"OK"] (1m22.233645502s)
142-
# 3 Downloaded Getting_Started_With_Git.pdf from
143-
# 3 Downloaded Working_With_Time_Series_Data.pdf from
144-
# 3 Downloaded Low-Code_Application_Development.pdf from https://dzone.com/asset/download/278339
145-
[000016] 3 [ 3 - scraped] map["url":"https://dzone.com/storage/assets/11231342-dzone-refcard288-introtolowcode.pdf"] (1m22.257504381s)
146-
link after assignment: https://dzone.com/asset/download/279336
147-
link before h.visit: https://dzone.com/asset/download/279336
148-
[000017] 3 [ 4 - request] map["url":"https://dzone.com/asset/download/279336"] (1m22.25785153s)
149-
^Csignal: interrupt
150-
*/
151-
// next page
152-
page++
153-
154-
// visit assets-list website construction
155-
g.Visit("https://dzone.com/services/widget/assets-listV2/DEFAULT?hidefeat=true&page=" + strconv.Itoa(page) + "&sort=downloads&type=refcard")
156-
157-
} // end of the for loop
99+
100+
// when the response is the last page, set the stop flag so no further pages are requested
101+
if string(r.Body) == `{"success":true,"result":{"data":{"assets":[],"sort":"downloads"}},"status":200}` {
102+
defer fmt.Println("Last page reached!")
103+
stop = true
104+
return
105+
}
106+
107+
}) // end of the assets-list response block
108+
109+
// visit assets-list until the last page is reached
110+
for page := 1; page < 999; page++ {
111+
if stop {
112+
break
113+
} else {
114+
assets.Visit(baseURL + strconv.Itoa(page) + "&sort=downloads&type=refcard")
115+
}
116+
}
158117

159118
} // end of the main function

0 commit comments

Comments
 (0)