diff --git a/common_crawl_scraper/README.md b/common_crawl_scraper/README.md
new file mode 100644
index 0000000..55185a3
--- /dev/null
+++ b/common_crawl_scraper/README.md
@@ -0,0 +1,9 @@
+# Download Common Crawl Data
+
+Use `aws s3 cp` to download WARC archives from the public `commoncrawl` S3 bucket.
+
+For example:
+
+```
+aws s3 cp s3://commoncrawl/crawl-data/CC-MAIN-2018-51/segments/1543520000000.0/warc/CC-MAIN-20181205155500-20181205205500-00000.warc.gz ./
+```
diff --git a/common_crawl_scraper/common_crawl_scraper.py b/common_crawl_scraper/common_crawl_scraper.py
new file mode 100644
index 0000000..fffa704
--- /dev/null
+++ b/common_crawl_scraper/common_crawl_scraper.py
@@ -0,0 +1,59 @@
+import os
+import string
+from concurrent.futures import ThreadPoolExecutor
+from glob import glob
+
+import shortuuid
+from bs4 import BeautifulSoup
+from warcio.archiveiterator import ArchiveIterator
+
+output_folder = "large-output-2023-2"
+if not os.path.exists(output_folder):
+    os.makedirs(output_folder)
+
+cc_main_files = glob("CC-MAIN-*gz")
+
+# HTTP status codes whose responses are discarded
+invalid_status_code = [401, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 421, 422, 423, 424, 425, 426, 428, 429, 431, 451, 500, 501, 502, 503, 504, 505, 506, 507, 508, 510, 511]
+
+
+def plain_text_percentage(content):
+    """Return the percentage of printable ASCII characters in `content`."""
+    total_chars = len(content)
+    if total_chars == 0:
+        return 0
+    text_chars = sum(c in string.printable for c in content)
+    return (text_chars / total_chars) * 100
+
+
+def process_cc_main_file(cc_main_file):
+    print("Processing file: " + cc_main_file)
+    with open(cc_main_file, 'rb') as f:
+        records = ArchiveIterator(f)
+        for record in records:
+            if record.rec_type == 'response':
+                url = record.rec_headers.get_header('WARC-Target-URI')
+                content_type = record.http_headers.get_header('Content-Type')
+                status_code = int(record.http_headers.get_statuscode())
+                if status_code not in invalid_status_code and content_type and ('text/html' in content_type or 'text/css' in content_type or 'text/x-css' in content_type):
+                    # Skip source maps and other map-like URLs
+                    if 'map' in url:
+                        continue
+                    file_name = shortuuid.uuid()
+                    file_ext = 'html' if 'text/html' in content_type else 'css'
+                    file_path = os.path.join(output_folder, f"{file_name}.{file_ext}")
+
+                    content = record.raw_stream.read()
+
+                    if len(content) > 1000:
+                        soup = BeautifulSoup(content, 'html.parser' if 'text/html' in content_type else 'html5lib')
+                        text_content = soup.get_text()
+                        # Drop pages whose extracted text is mostly printable ASCII
+                        if plain_text_percentage(text_content) > 80:
+                            continue
+                        with open(file_path, 'wb') as out_file:
+                            # Record the source URL as an HTML comment on the first line
+                            crawled_address = f"<!-- {url} -->\n".encode()
+                            out_file.write(crawled_address)
+                            out_file.write(content)
+
+    os.remove(cc_main_file)
+    print(f"Removed file: {cc_main_file}")
+
+
+if __name__ == "__main__":
+    with ThreadPoolExecutor() as executor:
+        executor.map(process_cc_main_file, cc_main_files)
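The README grabs one WARC file by hand; each crawl also publishes a `warc.paths.gz` manifest listing every WARC key, which is handy when scripting bulk downloads for `common_crawl_scraper.py`. A minimal sketch, assuming the standard manifest layout and the public HTTP mirror at `data.commoncrawl.org`:

```
import gzip

import requests

CRAWL = "CC-MAIN-2018-51"
manifest_url = f"https://data.commoncrawl.org/crawl-data/{CRAWL}/warc.paths.gz"

resp = requests.get(manifest_url, timeout=60)
resp.raise_for_status()

# Each line is a key like crawl-data/CC-MAIN-2018-51/segments/<segment>/warc/<file>.warc.gz
warc_keys = gzip.decompress(resp.content).decode().splitlines()
print(len(warc_keys), "WARC files in", CRAWL)
print(warc_keys[0])
```

Each listed key can then be fed to `aws s3 cp s3://commoncrawl/<key> ./` as in the README above.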
print(f"Downloaded: {file_name}") + except: + print(f"Failed to download: {url}") + +# Function to process an HTML file +def process_file(file): + if file.endswith('.html'): + file_path = os.path.join(input_folder, file) + + # Detect file encoding + with open(file_path, 'rb') as raw_file: + result = chardet.detect(raw_file.read()) + file_encoding = result['encoding'] + + # Read the first line for the URL + with open(file_path, 'r', encoding=file_encoding, errors='ignore') as html_file: + first_line = html_file.readline().strip() + base_url = re.sub(r"", r"\1", first_line).rstrip("/") + + # Read the file with the detected encoding for parsing + with open(file_path, 'r', encoding=file_encoding, errors='ignore') as html_file: + soup = BeautifulSoup(html_file, 'html.parser') + + # Find all the CSS links + css_links = soup.find_all('link', rel='stylesheet', href=True) + css_urls = [link['href'] for link in css_links] + + # Download the CSS files + for css_url in css_urls: + if not css_url.startswith('http'): + css_url = base_url + css_url + css_name = f"{file}-{css_url.split('/')[-1]}" + output_file_path = os.path.join(output_folder, css_name) + download_css(css_url, output_file_path) + + # Save inline CSS content as a separate file + inline_css_tags = soup.find_all('style', type='text/css') + for idx, inline_css_tag in enumerate(inline_css_tags): + inline_css_content = inline_css_tag.string + if inline_css_content: + css_name = f"{file}-inline-{idx}.css" + output_file_path = os.path.join(output_folder, css_name) + with open(output_file_path, 'w', encoding='utf-8') as inline_css_file: + inline_css_file.write(inline_css_content) + +# Iterate through the HTML files using multithreading +with ThreadPoolExecutor() as executor: + executor.map(process_file, os.listdir(input_folder)) diff --git a/figma_sanitizer/layer_sanitization.py b/figma_sanitizer/layer_sanitization.py new file mode 100644 index 0000000..a23a6e3 --- /dev/null +++ b/figma_sanitizer/layer_sanitization.py @@ -0,0 +1,47 @@ +import concurrent.futures +import glob +import json +import os +import re + +remove_keywords = [ + "Arrow", "Ellipse", "Frame", "Group", "Line", "Polygon", "Rectangle", "Star", "Vector" +] + +remove_keywords_pattern = re.compile('|'.join(remove_keywords), re.IGNORECASE) + + +def remove_nodes_with_keywords(node, pattern): + if 'children' in node: + node['children'] = [ + child for child in node['children'] + if not pattern.search(child['name']) + ] + for child in node['children']: + remove_nodes_with_keywords(child, pattern) + +def process_json_file(json_file, pattern): + with open(json_file, 'r') as f: + json_data = json.load(f) + + remove_nodes_with_keywords(json_data, pattern) + + with open(json_file, 'w') as f: + json.dump(json_data, f, indent=4) + +def main(): + folder_pattern = "../data/samples/figma-samples-5k.min/*" + folders = [folder for folder in glob.glob(folder_pattern) if os.path.isdir(folder)] + json_files = [os.path.join(folder, "file.json") for folder in folders] + + with concurrent.futures.ThreadPoolExecutor() as executor: + results = [executor.submit(process_json_file, json_file, remove_keywords_pattern) for json_file in json_files] + + for future in concurrent.futures.as_completed(results): + try: + future.result() + except Exception as e: + print(f"An error occurred while processing a JSON file: {e}") + +if __name__ == "__main__": + main() diff --git a/figma_sanitizer/meta_extractor.py b/figma_sanitizer/meta_extractor.py new file mode 100644 index 0000000..c24f629 --- /dev/null +++ 
diff --git a/figma_sanitizer/layer_sanitization.py b/figma_sanitizer/layer_sanitization.py
new file mode 100644
index 0000000..a23a6e3
--- /dev/null
+++ b/figma_sanitizer/layer_sanitization.py
@@ -0,0 +1,47 @@
+import concurrent.futures
+import glob
+import json
+import os
+import re
+
+# Layers whose names contain these keywords are treated as decoration and removed
+remove_keywords = [
+    "Arrow", "Ellipse", "Frame", "Group", "Line", "Polygon", "Rectangle", "Star", "Vector"
+]
+
+remove_keywords_pattern = re.compile('|'.join(remove_keywords), re.IGNORECASE)
+
+
+def remove_nodes_with_keywords(node, pattern):
+    if 'children' in node:
+        node['children'] = [
+            child for child in node['children']
+            if not pattern.search(child['name'])
+        ]
+        for child in node['children']:
+            remove_nodes_with_keywords(child, pattern)
+
+
+def process_json_file(json_file, pattern):
+    with open(json_file, 'r') as f:
+        json_data = json.load(f)
+
+    remove_nodes_with_keywords(json_data, pattern)
+
+    with open(json_file, 'w') as f:
+        json.dump(json_data, f, indent=4)
+
+
+def main():
+    folder_pattern = "../data/samples/figma-samples-5k.min/*"
+    folders = [folder for folder in glob.glob(folder_pattern) if os.path.isdir(folder)]
+    json_files = [os.path.join(folder, "file.json") for folder in folders]
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        results = [executor.submit(process_json_file, json_file, remove_keywords_pattern) for json_file in json_files]
+
+        for future in concurrent.futures.as_completed(results):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"An error occurred while processing a JSON file: {e}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/figma_sanitizer/meta_extractor.py b/figma_sanitizer/meta_extractor.py
new file mode 100644
index 0000000..c24f629
--- /dev/null
+++ b/figma_sanitizer/meta_extractor.py
@@ -0,0 +1,58 @@
+import json
+
+from bs4 import BeautifulSoup
+
+
+class JSONData:
+    def __init__(self, file_path):
+        with open(file_path, "r") as f:
+            self.json_data = json.load(f)
+
+    def extract_meaningful_info(self, data):
+        """
+        Extracts meaningful information from JSON data.
+
+        Args:
+            data (dict): A dictionary containing JSON data.
+
+        Returns:
+            dict: A dictionary containing the extracted meaningful information, with keys:
+                - id (int): The ID of the data.
+                - title (str): The title of the data.
+                - likes (int): The number of likes for the data.
+                - description (str): The description of the data.
+                - tags (list): A list of tags associated with the data.
+        """
+        meaningful_info = {}
+        meaningful_info["id"] = data["id"]
+        meaningful_info["title"] = data["name"]
+        meaningful_info["likes"] = data["like_count"]
+        meaningful_info["description"] = data["description"]
+        meaningful_info["tags"] = data["tags"]
+
+        return meaningful_info
+
+    def clean_data(self):
+        cleaned_data = []
+
+        for data in self.json_data:
+            meaningful_info = self.extract_meaningful_info(data)
+
+            # Strip HTML from the description and indent each line with a space
+            html_info = meaningful_info["description"]
+            soup = BeautifulSoup(html_info, "html.parser")
+            text = soup.get_text(separator=" ")
+            padded_text = "\n".join([" " + line for line in text.split("\n")])
+
+            meaningful_info["description"] = padded_text
+            cleaned_data.append(meaningful_info)
+
+        return cleaned_data
+
+
+if __name__ == "__main__":
+    data = JSONData("../data/latest/meta.json")
+    cleaned_data = data.clean_data()
+
+    with open("../data/latest/cleaned_meta.json", "w") as f:
+        json.dump(cleaned_data, f, indent=4)
diff --git a/figma_sanitizer/text_node_saver.py b/figma_sanitizer/text_node_saver.py
new file mode 100644
index 0000000..5471155
--- /dev/null
+++ b/figma_sanitizer/text_node_saver.py
@@ -0,0 +1,75 @@
+import glob
+import json
+import os
+import sqlite3
+
+
+# Function to find all text nodes in the JSON object
+def find_text_nodes(json_object, result):
+    if isinstance(json_object, dict):
+        if json_object.get("type") == "TEXT":
+            result.append(json_object)
+        for _, value in json_object.items():
+            find_text_nodes(value, result)
+    elif isinstance(json_object, list):
+        for item in json_object:
+            find_text_nodes(item, result)
+
+
+def main():
+    # Create SQLite database and table
+    conn = sqlite3.connect("text_nodes.db")
+    cursor = conn.cursor()
+
+    # Delete the table if it already exists
+    cursor.execute("DROP TABLE IF EXISTS text_nodes")
+
+    # Create the table
+    cursor.execute("""CREATE TABLE text_nodes (
+                        id TEXT PRIMARY KEY,
+                        name TEXT,
+                        type TEXT,
+                        json_data TEXT
+                    )""")
+
+    # Loop through matching folders
+    for folder in glob.glob("../data/samples/figma-samples-5k.min/*"):
+
+        # If not a folder, skip
+        if not os.path.isdir(folder):
+            continue
+
+        json_file = os.path.join(folder, "file.json")
+
+        # Extract parent folder name
+        parent_folder_name = os.path.basename(os.path.dirname(json_file))
+
+        # Load JSON data; skip the folder if the file is missing
+        try:
+            with open(json_file, "r") as file:
+                data = json.load(file)
+        except FileNotFoundError:
+            print(f"The file {json_file} does not exist.")
+            continue
+
+        # Find all the text nodes
+        text_nodes = []
+        find_text_nodes(data, text_nodes)
+
+        # Insert text nodes into the database
+        for node in text_nodes:
+            # Concatenate parent_folder_name and the node's id value
+            try:
+                prefixed_id = f"{parent_folder_name}_{node['id']}"
+            except KeyError:
+                continue
+
+            cursor.execute("""INSERT INTO text_nodes (id, name, type, json_data)
+                              VALUES (?, ?, ?, ?)""",
+                           (prefixed_id, node["name"], node["type"], json.dumps(node)))
+
+    conn.commit()
+    conn.close()
+
+
+if __name__ == "__main__":
+    main()
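Once `text_node_saver.py` has populated `text_nodes.db`, the table can be inspected directly with the `sqlite3` module; a minimal sketch (the queries are illustrative, only the schema comes from the script above):

```
import json
import sqlite3

conn = sqlite3.connect("text_nodes.db")
cursor = conn.cursor()

# Count stored text nodes and print a few of their character strings
cursor.execute("SELECT COUNT(*) FROM text_nodes")
print("total text nodes:", cursor.fetchone()[0])

cursor.execute("SELECT json_data FROM text_nodes LIMIT 5")
for (raw,) in cursor.fetchall():
    node = json.loads(raw)
    print(node.get("characters", ""))

conn.close()
```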
diff --git a/figma_stats/count_number.py b/figma_stats/count_number.py
new file mode 100644
index 0000000..42d03dd
--- /dev/null
+++ b/figma_stats/count_number.py
@@ -0,0 +1,37 @@
+class NumberProcessor:
+    def __init__(self, filename, output_filename):
+        self.filename = filename
+        self.output_filename = output_filename
+
+    def process(self):
+        with open(self.filename) as f:
+            content = f.read()
+
+        numbers = content.split(',')
+        count = len(numbers)
+
+        # Count the occurrences of each number
+        counts = {}
+        for n in numbers:
+            try:
+                n = float(n)
+                if n not in counts:
+                    counts[n] = 1
+                else:
+                    counts[n] += 1
+            except ValueError:
+                pass
+
+        # Sort the counts in descending order of frequency
+        sorted_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
+
+        with open(self.output_filename, 'w') as f:
+            f.write(f"Count of numbers: {count}\n")
+            f.write("Sorted numbers:\n")
+            for num, freq in sorted_counts:
+                f.write(f"{num}: {freq}\n")
+
+
+if __name__ == '__main__':
+    # Reads the widths written by top_level_frame_size_stat.py
+    processor = NumberProcessor('artifacts/top_level_frame_size_stat.txt', 'artifacts/stat-width-sorted.txt')
+    processor.process()
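The hand-rolled frequency dictionary in `count_number.py` can also be expressed with the standard library's `collections.Counter`; a minimal equivalent sketch:

```
from collections import Counter


def count_frequencies(raw):
    values = []
    for token in raw.split(','):
        try:
            values.append(float(token))
        except ValueError:
            pass
    # most_common() returns (value, frequency) pairs sorted by descending frequency
    return Counter(values).most_common()


# e.g. count_frequencies("360,360,1440,375,360") -> [(360.0, 3), (1440.0, 1), (375.0, 1)]
```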
diff --git a/figma_stats/stats.py b/figma_stats/stats.py
index f9ef0d4..a32bb82 100644
--- a/figma_stats/stats.py
+++ b/figma_stats/stats.py
@@ -1,17 +1,20 @@
 import json
-from pathlib import Path
+import os
 import random
 import re
+import sys
+import threading
+import warnings
+from pathlib import Path
+
 import click
 from tqdm import tqdm
-import os
-import sys
 
 # for easily importing utils
 current_dir = os.path.dirname(os.path.abspath(__file__))
 sys.path.insert(0, current_dir)
-from utils import is_text_not_empty, visit, flatten, extract_text
+from stats_util import extract_text, flatten, is_text_not_empty, visit
 
 
 @click.command()
@@ -39,13 +42,16 @@ def main(samples, max, shuffle):
     for directory in tqdm(directories, desc='Indexing..', leave=False):
         # get the text layers' json file
         json_path = directory / 'file.json'
-        with open(json_path, 'r') as data:
-            data = json.load(data)
-        document = data['document']
-        toplayers = flatten([canvas['children']
-                             for canvas in document['children']])
-
-        root_elements[directory.name] = toplayers
+        try:
+            with open(json_path, 'r') as file:
+                data = json.load(file)
+            # print(json.dumps(data, indent=4))
+            document = data['document']
+            toplayers = [child for canvas in document['children'] for child in canvas['children']]
+            root_elements[directory.name] = toplayers
+        except FileNotFoundError as e:
+            print(f"Error: {e}. Skipping directory {directory}")
 
+    print('Indexing done!')
     artifects_dir = Path('artifacts')
     artifects_dir.mkdir(exist_ok=True)
@@ -53,33 +59,52 @@ def main(samples, max, shuffle):
     # output the text layers' text content into one file
     with open(artifects_dir / 'texts.txt', 'w') as f:
         for id in tqdm(ids, desc="Text Layers"):
-            texts = extract_text(root_elements[id])
-            for text in texts:
-                f.write(text + '\n')
+            try:
+                texts = extract_text(root_elements[id])
+                for text in texts:
+                    f.write(text + '\n')
+            except Exception as e:
+                print(f"Error processing id {id}: {e}")
+                continue
         f.close()
+    print('Texts extracted!')
 
     # output the layers' name content into one file
-    with open(artifects_dir / 'layer-names.txt', 'w') as f:
+    with open(artifects_dir / 'layer-names.txt', 'a') as f:
         for id in tqdm(ids, desc="Layer Names"):
-            for layer in visit(root_elements[id], skip_types=['TEXT']):
-                is_text_not_empty(layer['name']) and f.write(
-                    layer['name'].strip() + '\n')
+            try:
+                for layer in visit(root_elements[id], skip_types=['TEXT']):
+                    if is_text_not_empty(layer['name']):
+                        f.write(layer['name'].strip() + '\n')
+            except KeyError as e:
+                print(f"Error processing id {id}: key {e} not found")
+                continue
         f.close()
+    print('Layer names extracted!')
 
-    with open(artifects_dir / 'layer-names-top.txt', 'w') as f:
+    with open(artifects_dir / 'layer-names-top.txt', 'a') as f:
         for id in tqdm(ids, desc="Top Layer Names"):
-            for layer in visit(root_elements[id], skip_types=['TEXT'], max=0):
-                is_text_not_empty(layer['name']) and f.write(
-                    layer['name'].strip() + '\n')
+            try:
+                for layer in visit(root_elements[id], skip_types=['TEXT'], max=0):
+                    if is_text_not_empty(layer['name']):
+                        f.write(layer['name'].strip() + '\n')
+            except KeyError as e:
+                print(f"Error processing id {id}: key {e} not found")
+                continue
         f.close()
+    print('Top layer names extracted!')
 
-    with open(artifects_dir / 'layer-names-top-frames.txt', 'w') as f:
+    with open(artifects_dir / 'layer-names-top-frames.txt', 'a') as f:
        for id in tqdm(ids, desc="Top Layer Names"):
-            for layer in visit(root_elements[id], visit_types=["FRAME"], max=0):
-                is_text_not_empty(layer['name']) and f.write(
-                    layer['name'].strip() + '\n')
+            try:
+                for layer in visit(root_elements[id], visit_types=["FRAME"], max=0):
+                    if is_text_not_empty(layer['name']):
+                        f.write(layer['name'].strip() + '\n')
+            except KeyError as e:
+                print(f"Error processing id {id}: key {e} not found")
+                continue
         f.close()
-
+    print('Top-level frame names extracted!')
 
 if __name__ == '__main__':
diff --git a/figma_stats/stats_util.py b/figma_stats/stats_util.py
new file mode 100644
index 0000000..a837fec
--- /dev/null
+++ b/figma_stats/stats_util.py
@@ -0,0 +1,48 @@
+def is_text_not_empty(text):
+    return text is not None and len(text.strip()) > 0
+
+
+def extract_text(layers: list):
+    """
+    Walk the layers recursively and extract the text layers' text content:
+    - if there is a 'children' key, recurse into it
+    - if a layer's type is 'TEXT', collect its text content (text#characters)
+    """
+
+    texts = []
+
+    for layer in visit(layers, visit_types=['TEXT']):
+        if 'type' in layer and layer['type'] == 'TEXT':
+            texts.append(layer['characters'])
+
+    return texts
+
+
+def visit(layers, skip_types=[], visit_types=None, max=None, depth=0):
+    if max is not None and depth > max:
+        return
+
+    # If layers is not a list, wrap it in a list
+    if not isinstance(layers, list):
+        layers = [layers]
+
+    for layer in layers:
+        if visit_types is not None:
+            if 'type' in layer and layer['type'] in visit_types:
+                yield layer
+        elif 'type' in layer and layer['type'] not in skip_types:
+            yield layer
+
+        if 'children' in layer:
+            yield from visit(layer['children'], skip_types=skip_types, visit_types=visit_types, max=max, depth=depth + 1)
+
+
+def flatten(lst):
+    result = []
+    for item in lst:
+        if isinstance(item, list):
+            result.extend(flatten(item))
+        else:
+            result.append(item)
+    return result
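A small self-contained example of how `visit` and `extract_text` from `stats_util.py` behave on a hand-made layer tree (the tree below is made up for illustration; run it from the `figma_stats/` directory so `stats_util` is importable):

```
from stats_util import extract_text, visit

# A made-up miniature Figma-like layer tree
layers = [
    {"type": "FRAME", "name": "Hero", "children": [
        {"type": "TEXT", "name": "Title", "characters": "Welcome"},
        {"type": "RECTANGLE", "name": "Background"},
    ]},
]

print(extract_text(layers))                       # ['Welcome']
print([l["name"] for l in visit(layers, max=0)])  # ['Hero'] (top level only)
```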
diff --git a/figma_stats/top_level_frame_size_stat.py b/figma_stats/top_level_frame_size_stat.py
new file mode 100644
index 0000000..646bde0
--- /dev/null
+++ b/figma_stats/top_level_frame_size_stat.py
@@ -0,0 +1,53 @@
+import json
+import os
+
+
+def extract_absolute_bounding_box(json_data):
+    """
+    Extracts the absolute bounding boxes of all top-level frames in a Figma file.
+    """
+    result = []
+
+    if json_data['document']['type'] == "DOCUMENT":
+        for node in json_data['document']['children']:
+            if node['type'] == "CANVAS":
+                for child in node['children']:
+                    if child['type'] == "FRAME":
+                        result.append(child['absoluteBoundingBox'])
+
+    return result
+
+
+def main():
+    """
+    Extract the absolute bounding box of every top-level frame in each sampled Figma file
+    and append the frame widths to artifacts/top_level_frame_size_stat.txt.
+    """
+    # Use a relative path from the script's directory
+    relative_path_folder = "../data/samples/figma-samples-5k.min"
+
+    for folder in os.listdir(relative_path_folder):
+        folder_path = os.path.join(relative_path_folder, folder)
+        if os.path.isdir(folder_path):
+            for file in os.listdir(folder_path):
+                if file.endswith("file.json"):
+                    print(os.path.join(folder_path, file))
+
+                    try:
+                        with open(os.path.join(folder_path, file), "r") as json_file:
+                            json_data = json.load(json_file)
+                    except (OSError, json.JSONDecodeError):
+                        print(f"Error loading JSON data from {file}")
+                        continue
+
+                    absolute_bounding_boxes = extract_absolute_bounding_box(json_data)
+
+                    # One comma-separated line of frame widths per Figma file
+                    with open("artifacts/top_level_frame_size_stat.txt", "a") as stat_file:
+                        for box in absolute_bounding_boxes:
+                            stat_file.write(str(box["width"]) + ",")
+                        stat_file.write("\n")
+
+
+if __name__ == "__main__":
+    main()
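Beyond the frequency table from `count_number.py`, basic summary statistics of the collected widths are straightforward with the standard library; a minimal sketch, assuming the artifact written by the script above:

```
import statistics

widths = []
with open("artifacts/top_level_frame_size_stat.txt") as f:
    for token in f.read().split(","):
        try:
            widths.append(float(token))
        except ValueError:
            pass  # skip empty tokens and bare newlines

print("frames :", len(widths))
print("mean   :", statistics.mean(widths))
print("median :", statistics.median(widths))
```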