Pull Request: Add support for Windows and multi-file CSV export with cleaned values #9


Open · wants to merge 1 commit into master
3 changes: 3 additions & 0 deletions README.md
@@ -21,6 +21,9 @@ is turned into the following CSV:

1,0,April,1,0,0,0.778582929065,20140312223924,20140312223929,4657771,20236,0
2,0,August,0,0,0,0.123830928525,20140312221818,20140312221822,4360163,11466,0

## UPDATES (4-18-25)
I've added Windows support so the script no longer crashes when run on Windows, plus a fix to read multi-line INSERT statements and a fix for the OverflowError I was hitting when setting the `field_size_limit`. The script now creates a separate CSV for each table in the SQL file, with the CSV names read dynamically from the SQL file. Each record now gets its own row in the CSV instead of everything landing on one row, and the CSV files get header rows, likewise pulled dynamically from the SQL file.
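
For example, a (hypothetical) dump statement like:

```sql
INSERT INTO `users` (`id`, `name`) VALUES (1,'April'),(2,'August');
```

should now yield a `users.csv` along the lines of:

```
id,name
1,April
2,August
```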

## License
The code is strung together from other public repos; I'm pretty sure the license is the standard MIT License.
170 changes: 112 additions & 58 deletions mysqldump_to_csv.py
@@ -2,20 +2,49 @@
import fileinput
import csv
import sys
import re

# This prevents prematurely closed pipes from raising
# an exception in Python. SIGPIPE does not exist on
# Windows, so only install the handler where it is available.
import signal

if hasattr(signal, 'SIGPIPE'):
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)


# allow large content in the dump; sys.maxsize overflows the
# C long used by the csv module on Windows, so cap at 2**31 - 1
# csv.field_size_limit(sys.maxsize)
csv.field_size_limit(2**31 - 1)
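# An alternative sometimes seen (my assumption, not part of this change):
# try sys.maxsize first and only back off if the platform rejects it.
#   try:
#       csv.field_size_limit(sys.maxsize)
#   except OverflowError:
#       csv.field_size_limit(2**31 - 1)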


def extract_columns(line):
    """
    Extracts column names from the INSERT INTO statement.
    Returns a list of column names, or None if not found.
    """
    match = re.search(r'INSERT INTO\s+`?\w+`?\s*\(([^)]+)\)', line, re.IGNORECASE)
    if match:
        cols = match.group(1)
        return [col.strip().strip('`') for col in cols.split(',')]
    return None


def extract_table_name(line):
    """
    Extracts the table name from an INSERT INTO statement.
    Handles backticks and spacing.
    """
    match = re.search(r'INSERT INTO\s+`?(\w+)`?\s', line, re.IGNORECASE)
    return match.group(1) if match else None
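

# A quick sketch of the two regex helpers above on a made-up line
# (the table and column names here are hypothetical):
#   >>> line = "INSERT INTO `users` (`id`, `name`) VALUES (1,'April');"
#   >>> extract_table_name(line)
#   'users'
#   >>> extract_columns(line)
#   ['id', 'name']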


def is_insert(line):
    """
    Returns true if the line begins a SQL insert statement.
    Ignores leading whitespace and case.
    """
    return line.lstrip().upper().startswith('INSERT INTO')
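

# E.g., on a hypothetical lowercase, indented line:
#   >>> is_insert("  insert into t (a) VALUES (1);")
#   True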


def get_values(line):
@@ -37,73 +66,98 @@ def values_sanity_check(values):

def parse_values(values, outfile):
    """
    Parses SQL INSERT values and writes clean CSV rows,
    removing single quotes around strings.
    """
    values = values.rstrip(';')
    tuples = re.findall(r'\([^\)]*\)', values)
    writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL)

    for val in tuples:
        # Remove outer parentheses
        val = val.strip()[1:-1]

        # Split on commas, respecting commas inside quoted strings
        parts = []
        current = ''
        in_quote = False
        escape = False

        for char in val:
            if escape:
                current += char
                escape = False
            elif char == '\\':
                escape = True
            elif char == "'":
                in_quote = not in_quote
                current += char
            elif char == ',' and not in_quote:
                parts.append(current.strip())
                current = ''
            else:
                current += char
        if current:
            parts.append(current.strip())

        # Clean each part: remove quotes, handle NULL
        clean_row = []
        for col in parts:
            if col.upper() == 'NULL':
                clean_row.append('')
            elif col.startswith("'") and col.endswith("'"):
                # Strip surrounding quotes and unescape inner quotes
                unquoted = col[1:-1].replace("\\'", "'").replace('\\\\', '\\')
                clean_row.append(unquoted)
            else:
                clean_row.append(col)

        writer.writerow(clean_row)
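

# For instance, given the raw values of one (hypothetical) statement,
#   (1,'April',NULL),(2,'O\'Brien',0);
# the loop above writes two CSV rows:
#   1,April,
#   2,O'Brien,0
# (NULL becomes an empty field, and the escaped quote is unescaped.)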


def main():
    """
    Parse arguments and start the program
    """
    # Iterate over all lines in all files
    # listed in sys.argv[1:]
    # or stdin if no args given.
    try:
        written_tables = set()  # keep track of tables we've written headers for

        buffer = ''
        for line in fileinput.input():
            line = line.strip()
            if not line or line.startswith('--') or line.startswith('/*'):
                continue  # skip comments and empty lines

            buffer += ' ' + line  # accumulate SQL statement lines

            if line.endswith(';'):
                if is_insert(buffer):
                    table_name = extract_table_name(buffer)
                    if not table_name:
                        raise Exception("Could not extract table name from INSERT statement!")

                    columns = extract_columns(buffer)
                    if not columns:
                        raise Exception("Could not extract column names from INSERT statement!")

                    values = get_values(buffer)
                    if not values_sanity_check(values):
                        raise Exception("Getting substring of SQL INSERT statement after ' VALUES ' failed!")

                    # Open the CSV file and write the header on first sight of the table
                    write_header = table_name not in written_tables
                    with open(f"{table_name}.csv", "a", newline='', encoding='utf-8') as outfile:
                        writer = csv.writer(outfile)
                        if write_header:
                            writer.writerow(columns)
                            written_tables.add(table_name)

                        parse_values(values, outfile)

                buffer = ''  # clear buffer for next statement
    except KeyboardInterrupt:
        sys.exit(0)


if __name__ == "__main__":
    main()
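

# Usage sketch: fileinput.input() reads the files named in sys.argv[1:]
# (or stdin when no arguments are given), so a run might look like
#   python mysqldump_to_csv.py dump.sql
# producing one <table>.csv per table found in the dump.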