9 changes: 9 additions & 0 deletions common_crawl_scraper/README.md
@@ -0,0 +1,9 @@
# Download Common Crawl Data

Use `aws s3 cp` to download WARC archives from the public `s3://commoncrawl` bucket (add `--no-sign-request` if you have no AWS credentials configured). For example:

```
aws s3 cp s3://commoncrawl/crawl-data/CC-MAIN-2018-51/segments/1543520000000.0/warc/CC-MAIN-20181205155500-20181205205500-00000.warc.gz ./
```
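
If you prefer to script the download, here is a minimal Python sketch using `boto3` with unsigned (anonymous) requests; the object key is simply the example path from the command above, so substitute the WARC you actually want:

```python
import boto3
from botocore import UNSIGNED
from botocore.config import Config

# The commoncrawl bucket is public, so send unsigned (anonymous) requests.
s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))

# Example key taken from the command above; replace it with the WARC you need.
key = ("crawl-data/CC-MAIN-2018-51/segments/1543520000000.0/warc/"
       "CC-MAIN-20181205155500-20181205205500-00000.warc.gz")

# Save the archive into the current directory under its own file name.
s3.download_file("commoncrawl", key, key.split("/")[-1])
```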
59 changes: 59 additions & 0 deletions common_crawl_scraper/common_crawl_scraper.py
@@ -0,0 +1,59 @@
import os
import string
from concurrent.futures import ThreadPoolExecutor
from glob import glob

import shortuuid
from bs4 import BeautifulSoup
from warcio.archiveiterator import ArchiveIterator

output_folder = "large-output-2023-2"
if not os.path.exists(output_folder):
os.makedirs(output_folder)

cc_main_files = glob("CC-MAIN-*gz")

# HTTP 4xx/5xx status codes whose responses are skipped
invalid_status_code = [401, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 421, 422, 423, 424, 425, 426, 428, 429, 431, 451, 500, 501, 502, 503, 504, 505, 506, 507, 508, 510, 511]

def plain_text_percentage(content):
total_chars = len(content)
if total_chars == 0:
return 0
text_chars = sum(c in string.printable for c in content)
return (text_chars / total_chars) * 100

def process_cc_main_file(cc_main_file):
print("Processing file: " + cc_main_file)
with open(cc_main_file, 'rb') as f:
        records = ArchiveIterator(f)
        for record in records:
if record.rec_type == 'response':
url = record.rec_headers.get_header('WARC-Target-URI')
content_type = record.http_headers.get_header('Content-Type')
status_code = int(record.http_headers.get_statuscode())
if status_code not in invalid_status_code and content_type and ('text/html' in content_type or 'text/css' in content_type or 'text/x-css' in content_type):
                    # Skip any URL containing "map" (e.g. sitemap pages)
                    if 'map' in url:
                        continue
file_name = shortuuid.uuid()
file_ext = 'html' if 'text/html' in content_type else 'css'
file_path = os.path.join(output_folder, f"{file_name}.{file_ext}")

                    # content_stream() decodes any chunked or gzip HTTP transfer encoding
                    content = record.content_stream().read()

                    # Keep payloads over 1 KB, and skip documents whose extracted text
                    # is mostly printable ASCII (i.e. text-heavy pages)
                    if len(content) > 1000:
soup = BeautifulSoup(content, 'html.parser' if 'text/html' in content_type else 'html5lib')
text_content = soup.get_text()
if plain_text_percentage(text_content) > 80:
continue
with open(file_path, 'wb') as out_file:
crawled_address = f"<!-- {url} -->\n".encode()
out_file.write(crawled_address)
out_file.write(content)

os.remove(cc_main_file)
print(f"Removed file: {cc_main_file}")

if __name__ == "__main__":
with ThreadPoolExecutor() as executor:
executor.map(process_cc_main_file, cc_main_files)

70 changes: 70 additions & 0 deletions common_crawl_scraper/css_scraper.py
@@ -0,0 +1,70 @@
import os
import re
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin

import chardet
import requests
from bs4 import BeautifulSoup

input_folder = 'YOUR_INPUT_FOLDER' # change it
output_folder = os.path.join(input_folder, 'downloaded_css')

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
os.makedirs(output_folder)

# Function to download a CSS file
def download_css(url, file_name):
    try:
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            with open(file_name, 'wb') as out_file:
                out_file.write(response.content)
            print(f"Downloaded: {file_name}")
    except requests.RequestException:
        print(f"Failed to download: {url}")

# Function to process an HTML file
def process_file(file):
if file.endswith('.html'):
file_path = os.path.join(input_folder, file)

        # Detect the file encoding (fall back to UTF-8 if detection fails)
        with open(file_path, 'rb') as raw_file:
            result = chardet.detect(raw_file.read())
            file_encoding = result['encoding'] or 'utf-8'

# Read the first line for the URL
with open(file_path, 'r', encoding=file_encoding, errors='ignore') as html_file:
first_line = html_file.readline().strip()
base_url = re.sub(r"<!-- (.*) -->", r"\1", first_line).rstrip("/")

# Read the file with the detected encoding for parsing
with open(file_path, 'r', encoding=file_encoding, errors='ignore') as html_file:
soup = BeautifulSoup(html_file, 'html.parser')

# Find all the CSS links
css_links = soup.find_all('link', rel='stylesheet', href=True)
css_urls = [link['href'] for link in css_links]

        # Download the CSS files
        for css_url in css_urls:
            # Resolve relative and protocol-relative hrefs against the page URL
            css_url = urljoin(base_url, css_url)
css_name = f"{file}-{css_url.split('/')[-1]}"
output_file_path = os.path.join(output_folder, css_name)
download_css(css_url, output_file_path)

        # Save inline CSS content as separate files (the type attribute is optional in HTML5)
        inline_css_tags = soup.find_all('style')
for idx, inline_css_tag in enumerate(inline_css_tags):
inline_css_content = inline_css_tag.string
if inline_css_content:
css_name = f"{file}-inline-{idx}.css"
output_file_path = os.path.join(output_folder, css_name)
with open(output_file_path, 'w', encoding='utf-8') as inline_css_file:
inline_css_file.write(inline_css_content)

# Iterate through the HTML files using multithreading
with ThreadPoolExecutor() as executor:
executor.map(process_file, os.listdir(input_folder))
47 changes: 47 additions & 0 deletions figma_sanitizer/layer_sanitization.py
@@ -0,0 +1,47 @@
import concurrent.futures
import glob
import json
import os
import re

remove_keywords = [
"Arrow", "Ellipse", "Frame", "Group", "Line", "Polygon", "Rectangle", "Star", "Vector"
]

remove_keywords_pattern = re.compile('|'.join(remove_keywords), re.IGNORECASE)


def remove_nodes_with_keywords(node, pattern):
    if 'children' in node:
        # Drop children whose layer name matches a keyword, then recurse
        # into the children that were kept
        node['children'] = [
            child for child in node['children']
            if not pattern.search(child.get('name', ''))
        ]
        for child in node['children']:
            remove_nodes_with_keywords(child, pattern)

def process_json_file(json_file, pattern):
with open(json_file, 'r') as f:
json_data = json.load(f)

remove_nodes_with_keywords(json_data, pattern)

with open(json_file, 'w') as f:
json.dump(json_data, f, indent=4)

def main():
folder_pattern = "../data/samples/figma-samples-5k.min/*"
folders = [folder for folder in glob.glob(folder_pattern) if os.path.isdir(folder)]
json_files = [os.path.join(folder, "file.json") for folder in folders]

with concurrent.futures.ThreadPoolExecutor() as executor:
results = [executor.submit(process_json_file, json_file, remove_keywords_pattern) for json_file in json_files]

for future in concurrent.futures.as_completed(results):
try:
future.result()
except Exception as e:
print(f"An error occurred while processing a JSON file: {e}")

if __name__ == "__main__":
main()
58 changes: 58 additions & 0 deletions figma_sanitizer/meta_extractor.py
@@ -0,0 +1,58 @@
import json

from bs4 import BeautifulSoup


class JSONData:
def __init__(self, file_path):
with open(file_path, "r") as f:
self.json_data = json.load(f)

def extract_meaningful_info(self, data):
"""
Extracts meaningful information from JSON data.

Args:
data (dict): A dictionary containing JSON data.

Returns:
dict: A dictionary containing the extracted meaningful information.
The dictionary has the following keys:
- id (int): The ID of the data.
- title (str): The title of the data.
- likes (int): The number of likes for the data.
- description (str): The description of the data.
- tags (list): A list of tags associated with the data.
"""
meaningful_info = {}
meaningful_info["id"] = data["id"]
meaningful_info["title"] = data["name"]
meaningful_info["likes"] = data["like_count"]
meaningful_info["description"] = data["description"]
meaningful_info["tags"] = data["tags"]

return meaningful_info

def clean_data(self):
cleaned_data = []

for data in self.json_data:
meaningful_info = self.extract_meaningful_info(data)

            # Strip HTML markup from the description (guard against a missing/None value)
            html_info = meaningful_info["description"] or ""
            soup = BeautifulSoup(html_info, "html.parser")
text = soup.get_text(separator=" ")
padded_text = "\n".join([" " + line for line in text.split("\n")])

meaningful_info["description"] = padded_text
cleaned_data.append(meaningful_info)

return cleaned_data


if __name__ == "__main__":
data = JSONData("../data/latest/meta.json")
cleaned_data = data.clean_data()

with open("../data/latest/cleaned_meta.json", "w") as f:
json.dump(cleaned_data, f, indent=4)
75 changes: 75 additions & 0 deletions figma_sanitizer/text_node_saver.py
@@ -0,0 +1,75 @@
import glob
import json
import os
import sqlite3


# Function to find all text nodes in the JSON object
def find_text_nodes(json_object, result):
if isinstance(json_object, dict):
if json_object.get("type") == "TEXT":
result.append(json_object)
for _, value in json_object.items():
find_text_nodes(value, result)
elif isinstance(json_object, list):
for item in json_object:
find_text_nodes(item, result)

def main():
# Create SQLite database and table
conn = sqlite3.connect("text_nodes.db")
cursor = conn.cursor()

# Delete the table if it already exists
cursor.execute("DROP TABLE IF EXISTS text_nodes")

# Create the table
cursor.execute("""CREATE TABLE text_nodes (
id TEXT PRIMARY KEY,
name TEXT,
type TEXT,
json_data TEXT
)""")

# Loop through matching folders
for folder in glob.glob("../data/samples/figma-samples-5k.min/*"):

# if not a folder, skip
if not os.path.isdir(folder):
continue

json_file = os.path.join(folder, "file.json")

# Extract parent folder name
parent_folder_name = os.path.basename(os.path.dirname(json_file))

        # Load JSON data
        try:
            with open(json_file, "r") as file:
                data = json.load(file)
        except FileNotFoundError:
            print(f"The file {json_file} does not exist.")
            # Skip this folder; otherwise `data` below would be stale or undefined
            continue


# Find all the text nodes
text_nodes = []
find_text_nodes(data, text_nodes)

# Insert text nodes into the database
for node in text_nodes:
# Concatenate parent_folder_name and node's id value
try:
prefixed_id = f"{parent_folder_name}_{node['id']}"
            except KeyError:
                # Skip nodes without an "id" field
                continue

cursor.execute("""INSERT INTO text_nodes (id, name, type, json_data)
VALUES (?, ?, ?, ?)""",
(prefixed_id, node["name"], node["type"], json.dumps(node)))

conn.commit()
conn.close()

if __name__ == "__main__":
main()
37 changes: 37 additions & 0 deletions figma_stats/count_number.py
@@ -0,0 +1,37 @@
class NumberProcessor:
def __init__(self, filename, output_filename):
self.filename = filename
self.output_filename = output_filename

def process(self):
with open(self.filename) as f:
content = f.read()

numbers = content.split(',')
count = len(numbers)

        # Count the occurrences of each number, skipping non-numeric tokens
        counts = {}
        for n in numbers:
            try:
                n = float(n)
            except ValueError:
                continue
            counts[n] = counts.get(n, 0) + 1

# Sort the counts in descending order based on their frequency
sorted_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)

with open(self.output_filename, 'w') as f:
f.write(f"Count of numbers: {count}\n")
f.write("Sorted numbers:\n")
for num, freq in sorted_counts:
f.write(f"{num}: {freq}\n")

if __name__ == '__main__':
processor = NumberProcessor('artifacts/top-level-frame-size-stat.txt', 'artifacts/stat-width-sorted.txt')
processor.process()