Skip to content

Commit ffe5139

Browse files
committed
Add gurufocus scraper
1 parent e28a624 commit ffe5139

File tree

4 files changed: +65 −45 lines

Trading/config/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,4 @@
2929
RATIO_STOCKS_PATH = TMP_PATH.joinpath("ratio_stocks/")
3030
ALERTS_PATH = CURRENT_FILE_PATH.joinpath("../live/alert/alert.json")
3131
CACHING_PATH = TMP_PATH.joinpath("caching")
32+
GURUFOCUS_DOWNLOADS_PATH = TMP_PATH.joinpath("gurufocus")

Trading/stock/gurufocus/cli.py

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
#!/usr/bin/env python3
22

3-
from Trading.stock.gurufocus.gurufocus import download_html, extract_stock_info
3+
from Trading.stock.gurufocus.gurufocus import GurufocusAnalyzer
44
from Trading.utils.google_search import get_first_google_result
5-
from Trading.utils.cli import Named
5+
from Trading.utils.cli import Named, JsonFileWriter
66
from Trading.utils.custom_logging import get_logger
7+
from Trading.config.config import GURUFOCUS_DOWNLOADS_PATH
78
from typing import Optional, List
89
import os
910
import fire
@@ -13,9 +14,13 @@
1314

1415

1516
# DEBUG=true cli.py analyze --names '["pdco", "paypal", "johnson&johnson", "mcdonalds", "pepsi", "uniper", "palantir"]'
16-
class GuruFocusCLI(Named):
17-
def __init__(self, name: Optional[str] = None, names: Optional[List[str]] = None):
17+
class GuruFocusCLI(Named, JsonFileWriter):
18+
def __init__(self, name: Optional[str] = None, names: Optional[List[str]] = None, filename: Optional[str] = None):
1819
Named.__init__(self, name=name, names=names)
20+
if filename is None:
21+
filename = GURUFOCUS_DOWNLOADS_PATH / "gurufocus.json"
22+
JsonFileWriter.__init__(self, filename=filename)
23+
1924
def analyze(self):
2025
if not self.name and not self.names:
2126
LOGGER.error("Name is required")
@@ -27,18 +32,20 @@ def analyze(self):
2732
urls.append(get_first_google_result("gurufocus summary " + name))
2833
LOGGER.debug(f"URLs: {urls}")
2934

30-
for url in urls:
31-
LOGGER.info(f"Scraping {url}")
32-
download_html(url, filename="gurufocus_page.html")
33-
html_file_path = "gurufocus_page.html"
34-
stock_info = extract_stock_info(html_file_path)
35-
if os.path.exists(html_file_path):
36-
os.remove(html_file_path)
37-
if not stock_info:
38-
LOGGER.error(f"Failed to extract stock info from {url}")
39-
continue
40-
LOGGER.info(stock_info)
41-
time.sleep(0.5)
35+
gf_analyzer = GurufocusAnalyzer(self.json_file_writer)
36+
gf_analyzer.run(items=urls, data={})
37+
# for url in urls:
38+
# LOGGER.info(f"Scraping {url}")
39+
# download_html(url, filename="gurufocus_page.html")
40+
# html_file_path = "gurufocus_page.html"
41+
# stock_info = extract_stock_info(html_file_path)
42+
# if os.path.exists(html_file_path):
43+
# os.remove(html_file_path)
44+
# if not stock_info:
45+
# LOGGER.error(f"Failed to extract stock info from {url}")
46+
# continue
47+
# LOGGER.info(stock_info)
48+
# time.sleep(0.1)
4249

4350
if __name__ == "__main__":
4451
fire.Fire(GuruFocusCLI)

Trading/stock/gurufocus/gurufocus.py

Lines changed: 18 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
import requests
21
import re
3-
from bs4 import BeautifulSoup
4-
from dataclasses import dataclass, asdict
2+
from Trading.utils.html import to_beautiful_soup
3+
from stateful_data_processor.processor import StatefulDataProcessor
4+
from pydantic import BaseModel
55

6-
@dataclass
7-
class GurufocusAnalysis:
6+
7+
class GurufocusAnalysis(BaseModel):
88
company_name: str = None
99
ticker: str = None
1010
market_cap: str = None
@@ -14,26 +14,14 @@ class GurufocusAnalysis:
1414
gf_value: float = None
1515
altman_z_score: float = None
1616

17-
def __iter__(self):
18-
return iter(asdict(self).items())
19-
def download_html(url, filename="gurufocus_page.html"):
20-
headers = {"User-Agent": "Mozilla/5.0"}
21-
response = requests.get(url, headers=headers)
22-
response.raise_for_status()
23-
24-
with open(filename, "w", encoding="utf-8") as f:
25-
f.write(response.text)
26-
27-
return filename
28-
29-
3017

3118
MARKET_CAP_REGEX = re.compile(
3219
r"Market Cap\s*[:\-]?\s*\$?\s*([\d\.]+\s*[MBT]?)", re.IGNORECASE
3320
)
34-
def extract_stock_info(html_file_path: str) -> GurufocusAnalysis:
35-
with open(html_file_path, "r", encoding="utf-8") as file:
36-
soup = BeautifulSoup(file, "html.parser")
21+
22+
23+
def extract_stock_info(html: str) -> GurufocusAnalysis:
24+
soup = to_beautiful_soup(html)
3725

3826
data = {
3927
"company_name": None,
@@ -133,11 +121,12 @@ def extract_stock_info(html_file_path: str) -> GurufocusAnalysis:
133121
return GurufocusAnalysis(**data)
134122

135123

136-
# Example usage
137-
# if __name__ == "__main__":
138-
# # url = "https://www.gurufocus.com/stock/GOGL/summary"
139-
# # html_file = download_html(url)
140-
# # print(f"Downloaded HTML to {html_file}")
141-
# html_path = "gurufocus_page.html" # Adjust path if needed
142-
# info = extract_stock_info(html_path)
143-
# print(info)
124+
class GurufocusAnalyzer(StatefulDataProcessor):
    """Stateful processor that scrapes gurufocus summary pages.

    Each item handed to the inherited ``run`` is a URL string; the
    extracted stock information accumulates in ``self.data``, keyed
    by ticker symbol.
    """

    def __init__(self, json_file_writer=None, logger=None):
        # json_file_writer is forwarded positionally to the base class —
        # presumably it persists processed state between runs; verify
        # against StatefulDataProcessor's constructor signature.
        super().__init__(json_file_writer, logger=logger)
        # ticker symbol -> extracted stock-info dict
        self.data = {}

    def process_item(self, item, iteration_index, data):
        # Each item is a gurufocus summary URL supplied by the caller.
        url = item
        # NOTE(review): extract_stock_info's parameter is named "html"
        # but receives a URL here; to_beautiful_soup fetches the URL
        # itself, so this works — confirm the parameter naming is
        # intentional.
        stock_info = extract_stock_info(url)
        # If ticker extraction failed, stock_info.ticker is None and
        # successive failed pages would overwrite each other under the
        # None key — TODO confirm whether that is acceptable.
        self.data[stock_info.ticker] = stock_info.dict()

Trading/utils/html.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from bs4 import BeautifulSoup
2+
import requests
3+
4+
def download_html(url, filename: str):
    """Fetch *url* and save its HTML body to *filename* (UTF-8).

    :param url: page to download.
    :param filename: path the HTML is written to.
    :return: *filename*, for convenient chaining.
    :raises requests.HTTPError: on a non-2xx response (via ``load_html``).
    """
    # Delegate the HTTP fetch to load_html so the browser-like
    # User-Agent and error handling live in exactly one place,
    # instead of being duplicated here.
    html = load_html(url)

    with open(filename, "w", encoding="utf-8") as f:
        f.write(html)

    return filename
13+
14+
def load_html(url, timeout: float = 30.0) -> str:
    """Return the HTML body of *url* as text.

    A browser-like User-Agent is sent because some sites
    (e.g. gurufocus) reject the default ``requests`` UA.

    :param url: page to fetch.
    :param timeout: seconds before the request is aborted. Without a
        timeout, ``requests`` may block indefinitely and hang the
        scraper on an unresponsive host.
    :raises requests.HTTPError: on a non-2xx response.
    :raises requests.Timeout: if the server does not answer in time.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    return response.text
20+
21+
def to_beautiful_soup(url: str) -> BeautifulSoup:
    """Download *url* and return its body parsed as a BeautifulSoup tree."""
    return BeautifulSoup(load_html(url), "html.parser")

0 commit comments

Comments
 (0)