diff --git a/common_crawl_scraper/README.md b/common_crawl_scraper/README.md
new file mode 100644
index 0000000..55185a3
--- /dev/null
+++ b/common_crawl_scraper/README.md
@@ -0,0 +1,9 @@
+# Download Common Crawl Data
+
+Use `aws s3 cp` to download WARC archives from the public `commoncrawl` S3 bucket.
+
+For example:
+
+```
+aws s3 cp s3://commoncrawl/crawl-data/CC-MAIN-2018-51/segments/1543520000000.0/warc/CC-MAIN-20181205155500-20181205205500-00000.warc.gz ./
+```
diff --git a/common_crawl_scraper/common_crawl_scraper.py b/common_crawl_scraper/common_crawl_scraper.py
new file mode 100644
index 0000000..fffa704
--- /dev/null
+++ b/common_crawl_scraper/common_crawl_scraper.py
@@ -0,0 +1,59 @@
+import os
+import string
+from concurrent.futures import ThreadPoolExecutor
+from glob import glob
+
+import shortuuid
+from bs4 import BeautifulSoup
+from warcio.archiveiterator import ArchiveIterator
+
+output_folder = "large-output-2023-2"
+if not os.path.exists(output_folder):
+    os.makedirs(output_folder)
+
+cc_main_files = glob("CC-MAIN-*gz")
+
+# HTTP status codes whose responses are discarded
+invalid_status_code = [401, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 421, 422, 423, 424, 425, 426, 428, 429, 431, 451, 500, 501, 502, 503, 504, 505, 506, 507, 508, 510, 511]
+
+
+def plain_text_percentage(content):
+    """Return the percentage of printable ASCII characters in `content`."""
+    total_chars = len(content)
+    if total_chars == 0:
+        return 0
+    text_chars = sum(c in string.printable for c in content)
+    return (text_chars / total_chars) * 100
+
+
+def process_cc_main_file(cc_main_file):
+    print("Processing file: " + cc_main_file)
+    with open(cc_main_file, 'rb') as f:
+        records = ArchiveIterator(f)
+        for record in records:
+            if record.rec_type == 'response':
+                url = record.rec_headers.get_header('WARC-Target-URI')
+                content_type = record.http_headers.get_header('Content-Type')
+                status_code = int(record.http_headers.get_statuscode())
+                if status_code not in invalid_status_code and content_type and ('text/html' in content_type or 'text/css' in content_type or 'text/x-css' in content_type):
+                    # Skip source maps and other map-like URLs
+                    if 'map' in url:
+                        continue
+                    file_name = shortuuid.uuid()
+                    file_ext = 'html' if 'text/html' in content_type else 'css'
+                    file_path = os.path.join(output_folder, f"{file_name}.{file_ext}")
+
+                    content = record.raw_stream.read()
+
+                    if len(content) > 1000:
+                        soup = BeautifulSoup(content, 'html.parser' if 'text/html' in content_type else 'html5lib')
+                        text_content = soup.get_text()
+                        # Drop pages whose extracted text is mostly printable ASCII
+                        if plain_text_percentage(text_content) > 80:
+                            continue
+                        with open(file_path, 'wb') as out_file:
+                            # Record the source URL as an HTML comment on the first line
+                            crawled_address = f"<!-- {url} -->\n".encode()
+                            out_file.write(crawled_address)
+                            out_file.write(content)
+
+    os.remove(cc_main_file)
+    print(f"Removed file: {cc_main_file}")
+
+
+if __name__ == "__main__":
+    with ThreadPoolExecutor() as executor:
+        executor.map(process_cc_main_file, cc_main_files)
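The README grabs one WARC file by hand; each crawl also publishes a `warc.paths.gz` manifest listing every WARC key, which is handy when scripting bulk downloads for `common_crawl_scraper.py`. A minimal sketch, assuming the standard manifest layout and the public HTTP mirror at `data.commoncrawl.org`:

```
import gzip

import requests

CRAWL = "CC-MAIN-2018-51"
manifest_url = f"https://data.commoncrawl.org/crawl-data/{CRAWL}/warc.paths.gz"

resp = requests.get(manifest_url, timeout=60)
resp.raise_for_status()

# Each line is a key like crawl-data/CC-MAIN-2018-51/segments/<segment>/warc/<file>.warc.gz
warc_keys = gzip.decompress(resp.content).decode().splitlines()
print(len(warc_keys), "WARC files in", CRAWL)
print(warc_keys[0])
```

Each listed key can then be fed to `aws s3 cp s3://commoncrawl/<key> ./` as in the README above.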
print(f"Downloaded: {file_name}") + except: + print(f"Failed to download: {url}") + +# Function to process an HTML file +def process_file(file): + if file.endswith('.html'): + file_path = os.path.join(input_folder, file) + + # Detect file encoding + with open(file_path, 'rb') as raw_file: + result = chardet.detect(raw_file.read()) + file_encoding = result['encoding'] + + # Read the first line for the URL + with open(file_path, 'r', encoding=file_encoding, errors='ignore') as html_file: + first_line = html_file.readline().strip() + base_url = re.sub(r"", r"\1", first_line).rstrip("/") + + # Read the file with the detected encoding for parsing + with open(file_path, 'r', encoding=file_encoding, errors='ignore') as html_file: + soup = BeautifulSoup(html_file, 'html.parser') + + # Find all the CSS links + css_links = soup.find_all('link', rel='stylesheet', href=True) + css_urls = [link['href'] for link in css_links] + + # Download the CSS files + for css_url in css_urls: + if not css_url.startswith('http'): + css_url = base_url + css_url + css_name = f"{file}-{css_url.split('/')[-1]}" + output_file_path = os.path.join(output_folder, css_name) + download_css(css_url, output_file_path) + + # Save inline CSS content as a separate file + inline_css_tags = soup.find_all('style', type='text/css') + for idx, inline_css_tag in enumerate(inline_css_tags): + inline_css_content = inline_css_tag.string + if inline_css_content: + css_name = f"{file}-inline-{idx}.css" + output_file_path = os.path.join(output_folder, css_name) + with open(output_file_path, 'w', encoding='utf-8') as inline_css_file: + inline_css_file.write(inline_css_content) + +# Iterate through the HTML files using multithreading +with ThreadPoolExecutor() as executor: + executor.map(process_file, os.listdir(input_folder)) diff --git a/figma_sanitizer/layer_sanitization.py b/figma_sanitizer/layer_sanitization.py new file mode 100644 index 0000000..a23a6e3 --- /dev/null +++ b/figma_sanitizer/layer_sanitization.py @@ -0,0 +1,47 @@ +import concurrent.futures +import glob +import json +import os +import re + +remove_keywords = [ + "Arrow", "Ellipse", "Frame", "Group", "Line", "Polygon", "Rectangle", "Star", "Vector" +] + +remove_keywords_pattern = re.compile('|'.join(remove_keywords), re.IGNORECASE) + + +def remove_nodes_with_keywords(node, pattern): + if 'children' in node: + node['children'] = [ + child for child in node['children'] + if not pattern.search(child['name']) + ] + for child in node['children']: + remove_nodes_with_keywords(child, pattern) + +def process_json_file(json_file, pattern): + with open(json_file, 'r') as f: + json_data = json.load(f) + + remove_nodes_with_keywords(json_data, pattern) + + with open(json_file, 'w') as f: + json.dump(json_data, f, indent=4) + +def main(): + folder_pattern = "../data/samples/figma-samples-5k.min/*" + folders = [folder for folder in glob.glob(folder_pattern) if os.path.isdir(folder)] + json_files = [os.path.join(folder, "file.json") for folder in folders] + + with concurrent.futures.ThreadPoolExecutor() as executor: + results = [executor.submit(process_json_file, json_file, remove_keywords_pattern) for json_file in json_files] + + for future in concurrent.futures.as_completed(results): + try: + future.result() + except Exception as e: + print(f"An error occurred while processing a JSON file: {e}") + +if __name__ == "__main__": + main() diff --git a/figma_sanitizer/meta_extractor.py b/figma_sanitizer/meta_extractor.py new file mode 100644 index 0000000..c24f629 --- /dev/null +++ 
diff --git a/figma_sanitizer/layer_sanitization.py b/figma_sanitizer/layer_sanitization.py
new file mode 100644
index 0000000..a23a6e3
--- /dev/null
+++ b/figma_sanitizer/layer_sanitization.py
@@ -0,0 +1,47 @@
+import concurrent.futures
+import glob
+import json
+import os
+import re
+
+# Layers whose names contain these keywords are treated as decoration and removed
+remove_keywords = [
+    "Arrow", "Ellipse", "Frame", "Group", "Line", "Polygon", "Rectangle", "Star", "Vector"
+]
+
+remove_keywords_pattern = re.compile('|'.join(remove_keywords), re.IGNORECASE)
+
+
+def remove_nodes_with_keywords(node, pattern):
+    if 'children' in node:
+        node['children'] = [
+            child for child in node['children']
+            if not pattern.search(child['name'])
+        ]
+        for child in node['children']:
+            remove_nodes_with_keywords(child, pattern)
+
+
+def process_json_file(json_file, pattern):
+    with open(json_file, 'r') as f:
+        json_data = json.load(f)
+
+    remove_nodes_with_keywords(json_data, pattern)
+
+    with open(json_file, 'w') as f:
+        json.dump(json_data, f, indent=4)
+
+
+def main():
+    folder_pattern = "../data/samples/figma-samples-5k.min/*"
+    folders = [folder for folder in glob.glob(folder_pattern) if os.path.isdir(folder)]
+    json_files = [os.path.join(folder, "file.json") for folder in folders]
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        results = [executor.submit(process_json_file, json_file, remove_keywords_pattern) for json_file in json_files]
+
+        for future in concurrent.futures.as_completed(results):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"An error occurred while processing a JSON file: {e}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/figma_sanitizer/meta_extractor.py b/figma_sanitizer/meta_extractor.py
new file mode 100644
index 0000000..c24f629
--- /dev/null
+++ b/figma_sanitizer/meta_extractor.py
@@ -0,0 +1,58 @@
+import json
+
+from bs4 import BeautifulSoup
+
+
+class JSONData:
+    def __init__(self, file_path):
+        with open(file_path, "r") as f:
+            self.json_data = json.load(f)
+
+    def extract_meaningful_info(self, data):
+        """
+        Extracts meaningful information from JSON data.
+
+        Args:
+            data (dict): A dictionary containing JSON data.
+
+        Returns:
+            dict: A dictionary containing the extracted meaningful information, with keys:
+                - id (int): The ID of the data.
+                - title (str): The title of the data.
+                - likes (int): The number of likes for the data.
+                - description (str): The description of the data.
+                - tags (list): A list of tags associated with the data.
+        """
+        meaningful_info = {}
+        meaningful_info["id"] = data["id"]
+        meaningful_info["title"] = data["name"]
+        meaningful_info["likes"] = data["like_count"]
+        meaningful_info["description"] = data["description"]
+        meaningful_info["tags"] = data["tags"]
+
+        return meaningful_info
+
+    def clean_data(self):
+        cleaned_data = []
+
+        for data in self.json_data:
+            meaningful_info = self.extract_meaningful_info(data)
+
+            # Strip HTML from the description and indent each line with a space
+            html_info = meaningful_info["description"]
+            soup = BeautifulSoup(html_info, "html.parser")
+            text = soup.get_text(separator=" ")
+            padded_text = "\n".join([" " + line for line in text.split("\n")])
+
+            meaningful_info["description"] = padded_text
+            cleaned_data.append(meaningful_info)
+
+        return cleaned_data
+
+
+if __name__ == "__main__":
+    data = JSONData("../data/latest/meta.json")
+    cleaned_data = data.clean_data()
+
+    with open("../data/latest/cleaned_meta.json", "w") as f:
+        json.dump(cleaned_data, f, indent=4)
diff --git a/figma_sanitizer/text_node_saver.py b/figma_sanitizer/text_node_saver.py
new file mode 100644
index 0000000..5471155
--- /dev/null
+++ b/figma_sanitizer/text_node_saver.py
@@ -0,0 +1,75 @@
+import glob
+import json
+import os
+import sqlite3
+
+
+# Function to find all text nodes in the JSON object
+def find_text_nodes(json_object, result):
+    if isinstance(json_object, dict):
+        if json_object.get("type") == "TEXT":
+            result.append(json_object)
+        for _, value in json_object.items():
+            find_text_nodes(value, result)
+    elif isinstance(json_object, list):
+        for item in json_object:
+            find_text_nodes(item, result)
+
+
+def main():
+    # Create SQLite database and table
+    conn = sqlite3.connect("text_nodes.db")
+    cursor = conn.cursor()
+
+    # Delete the table if it already exists
+    cursor.execute("DROP TABLE IF EXISTS text_nodes")
+
+    # Create the table
+    cursor.execute("""CREATE TABLE text_nodes (
+                        id TEXT PRIMARY KEY,
+                        name TEXT,
+                        type TEXT,
+                        json_data TEXT
+                    )""")
+
+    # Loop through matching folders
+    for folder in glob.glob("../data/samples/figma-samples-5k.min/*"):
+
+        # If not a folder, skip
+        if not os.path.isdir(folder):
+            continue
+
+        json_file = os.path.join(folder, "file.json")
+
+        # Extract parent folder name
+        parent_folder_name = os.path.basename(os.path.dirname(json_file))
+
+        # Load JSON data; skip the folder if the file is missing
+        try:
+            with open(json_file, "r") as file:
+                data = json.load(file)
+        except FileNotFoundError:
+            print(f"The file {json_file} does not exist.")
+            continue
+
+        # Find all the text nodes
+        text_nodes = []
+        find_text_nodes(data, text_nodes)
+
+        # Insert text nodes into the database
+        for node in text_nodes:
+            # Concatenate parent_folder_name and the node's id value
+            try:
+                prefixed_id = f"{parent_folder_name}_{node['id']}"
+            except KeyError:
+                continue
+
+            cursor.execute("""INSERT INTO text_nodes (id, name, type, json_data)
+                              VALUES (?, ?, ?, ?)""",
+                           (prefixed_id, node["name"], node["type"], json.dumps(node)))
+
+    conn.commit()
+    conn.close()
+
+
+if __name__ == "__main__":
+    main()
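Once `text_node_saver.py` has populated `text_nodes.db`, the table can be inspected directly with the `sqlite3` module; a minimal sketch (the queries are illustrative, only the schema comes from the script above):

```
import json
import sqlite3

conn = sqlite3.connect("text_nodes.db")
cursor = conn.cursor()

# Count stored text nodes and print a few of their character strings
cursor.execute("SELECT COUNT(*) FROM text_nodes")
print("total text nodes:", cursor.fetchone()[0])

cursor.execute("SELECT json_data FROM text_nodes LIMIT 5")
for (raw,) in cursor.fetchall():
    node = json.loads(raw)
    print(node.get("characters", ""))

conn.close()
```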
diff --git a/figma_stats/count_number.py b/figma_stats/count_number.py
new file mode 100644
index 0000000..42d03dd
--- /dev/null
+++ b/figma_stats/count_number.py
@@ -0,0 +1,37 @@
+class NumberProcessor:
+    def __init__(self, filename, output_filename):
+        self.filename = filename
+        self.output_filename = output_filename
+
+    def process(self):
+        with open(self.filename) as f:
+            content = f.read()
+
+        numbers = content.split(',')
+        count = len(numbers)
+
+        # Count the occurrences of each number
+        counts = {}
+        for n in numbers:
+            try:
+                n = float(n)
+                if n not in counts:
+                    counts[n] = 1
+                else:
+                    counts[n] += 1
+            except ValueError:
+                pass
+
+        # Sort the counts in descending order of frequency
+        sorted_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
+
+        with open(self.output_filename, 'w') as f:
+            f.write(f"Count of numbers: {count}\n")
+            f.write("Sorted numbers:\n")
+            for num, freq in sorted_counts:
+                f.write(f"{num}: {freq}\n")
+
+
+if __name__ == '__main__':
+    # Reads the widths written by top_level_frame_size_stat.py
+    processor = NumberProcessor('artifacts/top_level_frame_size_stat.txt', 'artifacts/stat-width-sorted.txt')
+    processor.process()
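The hand-rolled frequency dictionary in `count_number.py` can also be expressed with the standard library's `collections.Counter`; a minimal equivalent sketch:

```
from collections import Counter


def count_frequencies(raw):
    values = []
    for token in raw.split(','):
        try:
            values.append(float(token))
        except ValueError:
            pass
    # most_common() returns (value, frequency) pairs sorted by descending frequency
    return Counter(values).most_common()


# e.g. count_frequencies("360,360,1440,375,360") -> [(360.0, 3), (1440.0, 1), (375.0, 1)]
```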
diff --git a/figma_stats/stats.py b/figma_stats/stats.py
index f9ef0d4..a32bb82 100644
--- a/figma_stats/stats.py
+++ b/figma_stats/stats.py
@@ -1,17 +1,20 @@
 import json
-from pathlib import Path
+import os
 import random
 import re
+import sys
+import threading
+import warnings
+from pathlib import Path
+
 import click
 from tqdm import tqdm
-import os
-import sys
 
 # for easily importing utils
 current_dir = os.path.dirname(os.path.abspath(__file__))
 sys.path.insert(0, current_dir)
-from utils import is_text_not_empty, visit, flatten, extract_text
+from stats_util import extract_text, flatten, is_text_not_empty, visit
 
 
 @click.command()
@@ -39,13 +42,16 @@ def main(samples, max, shuffle):
     for directory in tqdm(directories, desc='Indexing..', leave=False):
         # get the text layers' json file
         json_path = directory / 'file.json'
-        with open(json_path, 'r') as data:
-            data = json.load(data)
-        document = data['document']
-        toplayers = flatten([canvas['children']
-                             for canvas in document['children']])
-
-        root_elements[directory.name] = toplayers
+        try:
+            with open(json_path, 'r') as file:
+                data = json.load(file)
+            # print(json.dumps(data, indent=4))
+            document = data['document']
+            toplayers = [child for canvas in document['children'] for child in canvas['children']]
+            root_elements[directory.name] = toplayers
+        except FileNotFoundError as e:
+            print(f"Error: {e}. Skipping directory {directory}")
 
+    print('Indexing done!')
     artifects_dir = Path('artifacts')
     artifects_dir.mkdir(exist_ok=True)
@@ -53,33 +59,52 @@ def main(samples, max, shuffle):
     # output the text layers' text content into one file
     with open(artifects_dir / 'texts.txt', 'w') as f:
         for id in tqdm(ids, desc="Text Layers"):
-            texts = extract_text(root_elements[id])
-            for text in texts:
-                f.write(text + '\n')
+            try:
+                texts = extract_text(root_elements[id])
+                for text in texts:
+                    f.write(text + '\n')
+            except Exception as e:
+                print(f"Error processing id {id}: {e}")
+                continue
         f.close()
+    print('Texts extracted!')
 
     # output the layers' name content into one file
-    with open(artifects_dir / 'layer-names.txt', 'w') as f:
+    with open(artifects_dir / 'layer-names.txt', 'a') as f:
         for id in tqdm(ids, desc="Layer Names"):
-            for layer in visit(root_elements[id], skip_types=['TEXT']):
-                is_text_not_empty(layer['name']) and f.write(
-                    layer['name'].strip() + '\n')
+            try:
+                for layer in visit(root_elements[id], skip_types=['TEXT']):
+                    if is_text_not_empty(layer['name']):
+                        f.write(layer['name'].strip() + '\n')
+            except KeyError as e:
+                print(f"Error processing id {id}: key {e} not found")
+                continue
         f.close()
+    print('Layer names extracted!')
 
-    with open(artifects_dir / 'layer-names-top.txt', 'w') as f:
+    with open(artifects_dir / 'layer-names-top.txt', 'a') as f:
         for id in tqdm(ids, desc="Top Layer Names"):
-            for layer in visit(root_elements[id], skip_types=['TEXT'], max=0):
-                is_text_not_empty(layer['name']) and f.write(
-                    layer['name'].strip() + '\n')
+            try:
+                for layer in visit(root_elements[id], skip_types=['TEXT'], max=0):
+                    if is_text_not_empty(layer['name']):
+                        f.write(layer['name'].strip() + '\n')
+            except KeyError as e:
+                print(f"Error processing id {id}: key {e} not found")
+                continue
         f.close()
+    print('Top layer names extracted!')
 
-    with open(artifects_dir / 'layer-names-top-frames.txt', 'w') as f:
+    with open(artifects_dir / 'layer-names-top-frames.txt', 'a') as f:
        for id in tqdm(ids, desc="Top Layer Names"):
-            for layer in visit(root_elements[id], visit_types=["FRAME"], max=0):
-                is_text_not_empty(layer['name']) and f.write(
-                    layer['name'].strip() + '\n')
+            try:
+                for layer in visit(root_elements[id], visit_types=["FRAME"], max=0):
+                    if is_text_not_empty(layer['name']):
+                        f.write(layer['name'].strip() + '\n')
+            except KeyError as e:
+                print(f"Error processing id {id}: key {e} not found")
+                continue
         f.close()
-
+    print('Top-level frame names extracted!')
 
 if __name__ == '__main__':
diff --git a/figma_stats/stats_util.py b/figma_stats/stats_util.py
new file mode 100644
index 0000000..a837fec
--- /dev/null
+++ b/figma_stats/stats_util.py
@@ -0,0 +1,48 @@
+def is_text_not_empty(text):
+    return text is not None and len(text.strip()) > 0
+
+
+def extract_text(layers: list):
+    """
+    Walk the layers recursively and extract the text layers' text content:
+    - if there is a 'children' key, recurse into it
+    - if a layer's type is 'TEXT', collect its text content (text#characters)
+    """
+
+    texts = []
+
+    for layer in visit(layers, visit_types=['TEXT']):
+        if 'type' in layer and layer['type'] == 'TEXT':
+            texts.append(layer['characters'])
+
+    return texts
+
+
+def visit(layers, skip_types=[], visit_types=None, max=None, depth=0):
+    if max is not None and depth > max:
+        return
+
+    # If layers is not a list, wrap it in a list
+    if not isinstance(layers, list):
+        layers = [layers]
+
+    for layer in layers:
+        if visit_types is not None:
+            if 'type' in layer and layer['type'] in visit_types:
+                yield layer
+        elif 'type' in layer and layer['type'] not in skip_types:
+            yield layer
+
+        if 'children' in layer:
+            yield from visit(layer['children'], skip_types=skip_types, visit_types=visit_types, max=max, depth=depth + 1)
+
+
+def flatten(lst):
+    result = []
+    for item in lst:
+        if isinstance(item, list):
+            result.extend(flatten(item))
+        else:
+            result.append(item)
+    return result
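A small self-contained example of how `visit` and `extract_text` from `stats_util.py` behave on a hand-made layer tree (the tree below is made up for illustration; run it from the `figma_stats/` directory so `stats_util` is importable):

```
from stats_util import extract_text, visit

# A made-up miniature Figma-like layer tree
layers = [
    {"type": "FRAME", "name": "Hero", "children": [
        {"type": "TEXT", "name": "Title", "characters": "Welcome"},
        {"type": "RECTANGLE", "name": "Background"},
    ]},
]

print(extract_text(layers))                       # ['Welcome']
print([l["name"] for l in visit(layers, max=0)])  # ['Hero'] (top level only)
```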
diff --git a/figma_stats/top_level_frame_size_stat.py b/figma_stats/top_level_frame_size_stat.py
new file mode 100644
index 0000000..646bde0
--- /dev/null
+++ b/figma_stats/top_level_frame_size_stat.py
@@ -0,0 +1,53 @@
+import json
+import os
+
+
+def extract_absolute_bounding_box(json_data):
+    """
+    Extracts the absolute bounding boxes of all top-level frames in a Figma file.
+    """
+    result = []
+
+    if json_data['document']['type'] == "DOCUMENT":
+        for node in json_data['document']['children']:
+            if node['type'] == "CANVAS":
+                for child in node['children']:
+                    if child['type'] == "FRAME":
+                        result.append(child['absoluteBoundingBox'])
+
+    return result
+
+
+def main():
+    """
+    Extract the absolute bounding box of every top-level frame in each sampled Figma file
+    and append the frame widths to artifacts/top_level_frame_size_stat.txt.
+    """
+    # Use a relative path from the script's directory
+    relative_path_folder = "../data/samples/figma-samples-5k.min"
+
+    for folder in os.listdir(relative_path_folder):
+        folder_path = os.path.join(relative_path_folder, folder)
+        if os.path.isdir(folder_path):
+            for file in os.listdir(folder_path):
+                if file.endswith("file.json"):
+                    print(os.path.join(folder_path, file))
+
+                    try:
+                        with open(os.path.join(folder_path, file), "r") as json_file:
+                            json_data = json.load(json_file)
+                    except (OSError, json.JSONDecodeError):
+                        print(f"Error loading JSON data from {file}")
+                        continue
+
+                    absolute_bounding_boxes = extract_absolute_bounding_box(json_data)
+
+                    # One comma-separated line of frame widths per Figma file
+                    with open("artifacts/top_level_frame_size_stat.txt", "a") as stat_file:
+                        for box in absolute_bounding_boxes:
+                            stat_file.write(str(box["width"]) + ",")
+                        stat_file.write("\n")
+
+
+if __name__ == "__main__":
+    main()
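Beyond the frequency table from `count_number.py`, basic summary statistics of the collected widths are straightforward with the standard library; a minimal sketch, assuming the artifact written by the script above:

```
import statistics

widths = []
with open("artifacts/top_level_frame_size_stat.txt") as f:
    for token in f.read().split(","):
        try:
            widths.append(float(token))
        except ValueError:
            pass  # skip empty tokens and bare newlines

print("frames :", len(widths))
print("mean   :", statistics.mean(widths))
print("median :", statistics.median(widths))
```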