9 changes: 9 additions & 0 deletions common_crawl_scraper/README.md
@@ -0,0 +1,9 @@
# Download Common Crawl Data

Use `aws s3 cp` to download WARC archives from the public `s3://commoncrawl` bucket (add `--no-sign-request` if you have no AWS credentials configured). For example:

```
aws s3 cp s3://commoncrawl/crawl-data/CC-MAIN-2018-51/segments/1543520000000.0/warc/CC-MAIN-20181205155500-20181205205500-00000.warc.gz ./
```
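
If you prefer to script the download, here is a minimal Python sketch using `boto3` with unsigned (anonymous) requests; the object key is simply the example path from the command above, so substitute the WARC you actually want:

```python
import boto3
from botocore import UNSIGNED
from botocore.config import Config

# The commoncrawl bucket is public, so send unsigned (anonymous) requests.
s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))

# Example key taken from the command above; replace it with the WARC you need.
key = ("crawl-data/CC-MAIN-2018-51/segments/1543520000000.0/warc/"
       "CC-MAIN-20181205155500-20181205205500-00000.warc.gz")

# Save the archive into the current directory under its own file name.
s3.download_file("commoncrawl", key, key.split("/")[-1])
```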
59 changes: 59 additions & 0 deletions common_crawl_scraper/common_crawl_scraper.py
@@ -0,0 +1,59 @@
import os
import string
from concurrent.futures import ThreadPoolExecutor
from glob import glob

import shortuuid
from bs4 import BeautifulSoup
from warcio.archiveiterator import ArchiveIterator

output_folder = "large-output-2023-2"
if not os.path.exists(output_folder):
os.makedirs(output_folder)

cc_main_files = glob("CC-MAIN-*gz")

# HTTP 4xx/5xx status codes whose responses are skipped
invalid_status_code = [401, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 421, 422, 423, 424, 425, 426, 428, 429, 431, 451, 500, 501, 502, 503, 504, 505, 506, 507, 508, 510, 511]

def plain_text_percentage(content):
total_chars = len(content)
if total_chars == 0:
return 0
text_chars = sum(c in string.printable for c in content)
return (text_chars / total_chars) * 100

def process_cc_main_file(cc_main_file):
print("Processing file: " + cc_main_file)
with open(cc_main_file, 'rb') as f:
        records = ArchiveIterator(f)
        for record in records:
if record.rec_type == 'response':
url = record.rec_headers.get_header('WARC-Target-URI')
content_type = record.http_headers.get_header('Content-Type')
status_code = int(record.http_headers.get_statuscode())
if status_code not in invalid_status_code and content_type and ('text/html' in content_type or 'text/css' in content_type or 'text/x-css' in content_type):
                    # Skip any URL containing "map" (e.g. sitemap pages)
                    if 'map' in url:
                        continue
file_name = shortuuid.uuid()
file_ext = 'html' if 'text/html' in content_type else 'css'
file_path = os.path.join(output_folder, f"{file_name}.{file_ext}")

                    # content_stream() decodes any chunked or gzip HTTP transfer encoding
                    content = record.content_stream().read()

                    # Keep payloads over 1 KB, and skip documents whose extracted text
                    # is mostly printable ASCII (i.e. text-heavy pages)
                    if len(content) > 1000:
soup = BeautifulSoup(content, 'html.parser' if 'text/html' in content_type else 'html5lib')
text_content = soup.get_text()
if plain_text_percentage(text_content) > 80:
continue
with open(file_path, 'wb') as out_file:
crawled_address = f"<!-- {url} -->\n".encode()
out_file.write(crawled_address)
out_file.write(content)

os.remove(cc_main_file)
print(f"Removed file: {cc_main_file}")

if __name__ == "__main__":
with ThreadPoolExecutor() as executor:
executor.map(process_cc_main_file, cc_main_files)

70 changes: 70 additions & 0 deletions common_crawl_scraper/css_scraper.py
@@ -0,0 +1,70 @@
import os
import re
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin

import chardet
import requests
from bs4 import BeautifulSoup

input_folder = 'YOUR_INPUT_FOLDER' # change it
output_folder = os.path.join(input_folder, 'downloaded_css')

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
os.makedirs(output_folder)

# Function to download a CSS file
def download_css(url, file_name):
    try:
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            with open(file_name, 'wb') as out_file:
                out_file.write(response.content)
            print(f"Downloaded: {file_name}")
    except requests.RequestException:
        print(f"Failed to download: {url}")

# Function to process an HTML file
def process_file(file):
if file.endswith('.html'):
file_path = os.path.join(input_folder, file)

        # Detect the file encoding (fall back to UTF-8 if detection fails)
        with open(file_path, 'rb') as raw_file:
            result = chardet.detect(raw_file.read())
            file_encoding = result['encoding'] or 'utf-8'

# Read the first line for the URL
with open(file_path, 'r', encoding=file_encoding, errors='ignore') as html_file:
first_line = html_file.readline().strip()
base_url = re.sub(r"<!-- (.*) -->", r"\1", first_line).rstrip("/")

# Read the file with the detected encoding for parsing
with open(file_path, 'r', encoding=file_encoding, errors='ignore') as html_file:
soup = BeautifulSoup(html_file, 'html.parser')

# Find all the CSS links
css_links = soup.find_all('link', rel='stylesheet', href=True)
css_urls = [link['href'] for link in css_links]

        # Download the CSS files
        for css_url in css_urls:
            # Resolve relative and protocol-relative hrefs against the page URL
            css_url = urljoin(base_url, css_url)
css_name = f"{file}-{css_url.split('/')[-1]}"
output_file_path = os.path.join(output_folder, css_name)
download_css(css_url, output_file_path)

        # Save inline CSS content as separate files (the type attribute is optional in HTML5)
        inline_css_tags = soup.find_all('style')
for idx, inline_css_tag in enumerate(inline_css_tags):
inline_css_content = inline_css_tag.string
if inline_css_content:
css_name = f"{file}-inline-{idx}.css"
output_file_path = os.path.join(output_folder, css_name)
with open(output_file_path, 'w', encoding='utf-8') as inline_css_file:
inline_css_file.write(inline_css_content)

# Iterate through the HTML files using multithreading
with ThreadPoolExecutor() as executor:
executor.map(process_file, os.listdir(input_folder))
47 changes: 47 additions & 0 deletions figma_sanitizer/layer_sanitization.py
@@ -0,0 +1,47 @@
import concurrent.futures
import glob
import json
import os
import re

remove_keywords = [
"Arrow", "Ellipse", "Frame", "Group", "Line", "Polygon", "Rectangle", "Star", "Vector"
]

remove_keywords_pattern = re.compile('|'.join(remove_keywords), re.IGNORECASE)


def remove_nodes_with_keywords(node, pattern):
    if 'children' in node:
        # Drop children whose layer name matches a keyword, then recurse
        # into the children that were kept
        node['children'] = [
            child for child in node['children']
            if not pattern.search(child.get('name', ''))
        ]
        for child in node['children']:
            remove_nodes_with_keywords(child, pattern)

def process_json_file(json_file, pattern):
with open(json_file, 'r') as f:
json_data = json.load(f)

remove_nodes_with_keywords(json_data, pattern)

with open(json_file, 'w') as f:
json.dump(json_data, f, indent=4)

def main():
folder_pattern = "../data/samples/figma-samples-5k.min/*"
folders = [folder for folder in glob.glob(folder_pattern) if os.path.isdir(folder)]
json_files = [os.path.join(folder, "file.json") for folder in folders]

with concurrent.futures.ThreadPoolExecutor() as executor:
results = [executor.submit(process_json_file, json_file, remove_keywords_pattern) for json_file in json_files]

for future in concurrent.futures.as_completed(results):
try:
future.result()
except Exception as e:
print(f"An error occurred while processing a JSON file: {e}")

if __name__ == "__main__":
main()
58 changes: 58 additions & 0 deletions figma_sanitizer/meta_extractor.py
@@ -0,0 +1,58 @@
import json

from bs4 import BeautifulSoup


class JSONData:
def __init__(self, file_path):
with open(file_path, "r") as f:
self.json_data = json.load(f)

def extract_meaningful_info(self, data):
"""
Extracts meaningful information from JSON data.

Args:
data (dict): A dictionary containing JSON data.

Returns:
dict: A dictionary containing the extracted meaningful information.
The dictionary has the following keys:
- id (int): The ID of the data.
- title (str): The title of the data.
- likes (int): The number of likes for the data.
- description (str): The description of the data.
- tags (list): A list of tags associated with the data.
"""
meaningful_info = {}
meaningful_info["id"] = data["id"]
meaningful_info["title"] = data["name"]
meaningful_info["likes"] = data["like_count"]
meaningful_info["description"] = data["description"]
meaningful_info["tags"] = data["tags"]

return meaningful_info

def clean_data(self):
cleaned_data = []

for data in self.json_data:
meaningful_info = self.extract_meaningful_info(data)

            # Strip HTML markup from the description (guard against a missing/None value)
            html_info = meaningful_info["description"] or ""
            soup = BeautifulSoup(html_info, "html.parser")
text = soup.get_text(separator=" ")
padded_text = "\n".join([" " + line for line in text.split("\n")])

meaningful_info["description"] = padded_text
cleaned_data.append(meaningful_info)

return cleaned_data


if __name__ == "__main__":
data = JSONData("../data/latest/meta.json")
cleaned_data = data.clean_data()

with open("../data/latest/cleaned_meta.json", "w") as f:
json.dump(cleaned_data, f, indent=4)
75 changes: 75 additions & 0 deletions figma_sanitizer/text_node_saver.py
@@ -0,0 +1,75 @@
import glob
import json
import os
import sqlite3


# Function to find all text nodes in the JSON object
def find_text_nodes(json_object, result):
if isinstance(json_object, dict):
if json_object.get("type") == "TEXT":
result.append(json_object)
for _, value in json_object.items():
find_text_nodes(value, result)
elif isinstance(json_object, list):
for item in json_object:
find_text_nodes(item, result)

def main():
# Create SQLite database and table
conn = sqlite3.connect("text_nodes.db")
cursor = conn.cursor()

# Delete the table if it already exists
cursor.execute("DROP TABLE IF EXISTS text_nodes")

# Create the table
cursor.execute("""CREATE TABLE text_nodes (
id TEXT PRIMARY KEY,
name TEXT,
type TEXT,
json_data TEXT
)""")

# Loop through matching folders
for folder in glob.glob("../data/samples/figma-samples-5k.min/*"):

# if not a folder, skip
if not os.path.isdir(folder):
continue

json_file = os.path.join(folder, "file.json")

# Extract parent folder name
parent_folder_name = os.path.basename(os.path.dirname(json_file))

        # Load JSON data
        try:
            with open(json_file, "r") as file:
                data = json.load(file)
        except FileNotFoundError:
            print(f"The file {json_file} does not exist.")
            # Skip this folder; otherwise `data` below would be stale or undefined
            continue


# Find all the text nodes
text_nodes = []
find_text_nodes(data, text_nodes)

# Insert text nodes into the database
for node in text_nodes:
# Concatenate parent_folder_name and node's id value
try:
prefixed_id = f"{parent_folder_name}_{node['id']}"
            except KeyError:
                # Skip nodes without an "id" field
                continue

cursor.execute("""INSERT INTO text_nodes (id, name, type, json_data)
VALUES (?, ?, ?, ?)""",
(prefixed_id, node["name"], node["type"], json.dumps(node)))

conn.commit()
conn.close()

if __name__ == "__main__":
main()
37 changes: 37 additions & 0 deletions figma_stats/count_number.py
@@ -0,0 +1,37 @@
class NumberProcessor:
def __init__(self, filename, output_filename):
self.filename = filename
self.output_filename = output_filename

def process(self):
with open(self.filename) as f:
content = f.read()

numbers = content.split(',')
count = len(numbers)

        # Count the occurrences of each number, skipping non-numeric tokens
        counts = {}
        for n in numbers:
            try:
                n = float(n)
            except ValueError:
                continue
            counts[n] = counts.get(n, 0) + 1

# Sort the counts in descending order based on their frequency
sorted_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)

with open(self.output_filename, 'w') as f:
f.write(f"Count of numbers: {count}\n")
f.write("Sorted numbers:\n")
for num, freq in sorted_counts:
f.write(f"{num}: {freq}\n")

if __name__ == '__main__':
processor = NumberProcessor('artifacts/top-level-frame-size-stat.txt', 'artifacts/stat-width-sorted.txt')
processor.process()