This repository was archived by the owner on Apr 23, 2025. It is now read-only.
File tree Expand file tree Collapse file tree 7 files changed +38
-10
lines changed Expand file tree Collapse file tree 7 files changed +38
-10
lines changed Original file line number Diff line number Diff line change @@ -5,6 +5,7 @@ A web crawler/scraper to find the broken links in the targeted seed url based on
5
5
6
6
##Installation
7
7
1 . Redis
8
+ 3 . Fabric
8
9
2 . Python 2.7+
9
10
10
11
##Instructions
@@ -26,18 +27,26 @@ A web crawler/scraper to find the broken links in the targeted seed url based on
26
27
export SMTP_PASSWORD = ' smtp-password'
27
28
```
28
29
30
+ 4 . Also, set one more environment variable to save the ** ` Logs ` ** of the app in a defined location.
31
+ ``` python
32
+ # your shell config file
33
+ export LOGS_DIR = ' path/to/logs'
34
+ ```
35
+
29
36
##Commands
37
+ Note: First install * ` Fabric ` * to run the commands below
38
+
30
39
To run a gui app :
31
40
```
32
- $ python rottoscraper/run.py app
41
+ $ fab app
33
42
```
34
43
To run a dispatcher :
35
44
```
36
- $ python rottoscraper/run.py dispatcher
45
+ $ fab dispatcher
37
46
```
38
47
To run a worker :
39
48
```
40
- $ python rotttoscraper/ worker.py
49
+ $ fab worker
41
50
```
42
51
##Developer
43
52
1 . [ Akshay Pratap Singh] ( https://www.facebook.com/AKSHAYPRATAP007 )
Original file line number Diff line number Diff line change
1
+ #! /usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ from fabric .api import local
5
+
6
+ def app ():
7
+ local ('python rottoscraper/run.py app' )
8
+
9
+ def dispatcher ():
10
+ local ('python rottoscraper/run.py dispatcher' )
11
+
12
+ def worker ():
13
+ local ('python rottoscraper/worker.py' )
Original file line number Diff line number Diff line change 28
28
# SMTP Credentials
29
29
SMTP_USER = os .getenv ('SMTP_USER' , None )
30
30
SMTP_PASSWORD = os .getenv ('SMTP_PASSWORD' , None )
31
+
32
+ # Logs DIR Path
33
+ LOGS_DIR = os .getenv ('LOGS_DIR' , 'logs/' )
Original file line number Diff line number Diff line change 36
36
</ table >
37
37
</ div >
38
38
< div class ="result-content fancy-box ">
39
- < div class ="msg " ng-show =" website.result.length==0 ">
39
+ < div class ="msg " ng-if =" ! website.result ">
40
40
< p > No Rotto Links Page Found</ p >
41
41
</ div >
42
- < div class ="result-content-row " ng-repeat ="rottopage in website.result ">
42
+ < div class ="result-content-row " ng-if =' website.result ' ng-repeat ="rottopage in website.result ">
43
43
44
44
< table class ="table ">
45
45
< tr >
Original file line number Diff line number Diff line change 9
9
from logbook import FileHandler
10
10
from logbook import Logger
11
11
12
+ from config import LOGS_DIR
13
+
12
14
log = Logger ('scraper' )
13
15
14
16
# Create the logs directory if it does not exist
15
- if not os .path .exists ('logs' ):
16
- os .makedirs ('logs' )
17
- file_handler = FileHandler ('logs/app.log' , level = logbook .DEBUG )
17
+ if not os .path .exists (LOGS_DIR ):
18
+ os .makedirs (LOGS_DIR )
19
+ log_file_name = 'rottoscraper.log'
20
+ file_handler = FileHandler (LOGS_DIR + log_file_name , level = logbook .DEBUG )
18
21
file_handler .push_application ()
Original file line number Diff line number Diff line change @@ -89,7 +89,7 @@ def search_keywords(self, text=None):
89
89
while trans is None :
90
90
# trans=currentNode.GetTransition(text[index])
91
91
for x in currentNode .transitions :
92
- if unicode ( x .char ) == c :
92
+ if x .char == c :
93
93
trans = x
94
94
if currentNode == self .root :
95
95
break
Original file line number Diff line number Diff line change @@ -87,7 +87,7 @@ def get_plain_text(html):
87
87
Return the plain text in utf-8 encoding from a html
88
88
"""
89
89
raw_text = nltk .clean_html (html )
90
- text = u' ' .join (raw_text .split ()).encode ( 'utf-8' ). lower ()
90
+ text = u' ' .join (raw_text .split ()).lower ()
91
91
return text
92
92
93
93
def get_all_links (html ):
You can’t perform that action at this time.
0 commit comments