From 87b5eabbd04497dbd2b7b982033dd210e141d62c Mon Sep 17 00:00:00 2001 From: Zuleykha Pavlichenkova Date: Tue, 2 Jul 2024 18:15:07 +0200 Subject: [PATCH 1/5] Point to the most recent version of duckdb submodule --- duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/duckdb b/duckdb index 70fd6a8..cd4e519 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 70fd6a8a2450c1e2a7d0547d4c0666a649dc378e +Subproject commit cd4e5194a7de618f8ac1d8bc1be423ab2fa1bd85 From c997853682a7b0bcf2e43e9bc4a8590e905fd0ee Mon Sep 17 00:00:00 2001 From: Zuleykha Pavlichenkova Date: Thu, 4 Jul 2024 09:34:23 +0200 Subject: [PATCH 2/5] update duckdb submodule to resolve merge conflict --- duckdb | 2 +- scripts/extension-upload.sh | 90 --------- scripts/fuzzer_helper.py | 207 ++++++++++++++++++++ scripts/parser_test.py | 21 ++ scripts/reduce_sql.py | 368 ++++++++++++++++++++++++++++++++++++ scripts/run_fuzzer.py | 199 +++++++++++++++++++ scripts/run_sqlancer.py | 150 +++++++++++++++ scripts/run_test_list.py | 65 +++++++ scripts/runsqlsmith.py | 52 +++++ scripts/try_timeout.py | 48 +++++ 10 files changed, 1111 insertions(+), 91 deletions(-) delete mode 100755 scripts/extension-upload.sh create mode 100644 scripts/fuzzer_helper.py create mode 100644 scripts/parser_test.py create mode 100644 scripts/reduce_sql.py create mode 100644 scripts/run_fuzzer.py create mode 100644 scripts/run_sqlancer.py create mode 100644 scripts/run_test_list.py create mode 100644 scripts/runsqlsmith.py create mode 100644 scripts/try_timeout.py diff --git a/duckdb b/duckdb index cd4e519..7b2cdc7 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit cd4e5194a7de618f8ac1d8bc1be423ab2fa1bd85 +Subproject commit 7b2cdc786bf64ed776941a3e4a65722941b957a6 diff --git a/scripts/extension-upload.sh b/scripts/extension-upload.sh deleted file mode 100755 index 9fd5b39..0000000 --- a/scripts/extension-upload.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -# Extension upload script - -# Usage: ./extension-upload.sh -# : Name of the extension -# : Version (commit / version tag) of the extension -# : Version (commit / version tag) of DuckDB -# : Architecture target of the extension binary -# : S3 bucket to upload to -# : Set this as the latest version ("true" / "false", default: "false") -# : Set this as a versioned version that will prevent its deletion - -set -e - -if [[ $4 == wasm* ]]; then - ext="/tmp/extension/$1.duckdb_extension.wasm" -else - ext="/tmp/extension/$1.duckdb_extension" -fi - -echo $ext - -script_dir="$(dirname "$(readlink -f "$0")")" - -# calculate SHA256 hash of extension binary -cat $ext > $ext.append - -if [[ $4 == wasm* ]]; then - # 0 for custom section - # 113 in hex = 275 in decimal, total lenght of what follows (1 + 16 + 2 + 256) - # [1(continuation) + 0010011(payload) = \x93, 0(continuation) + 10(payload) = \x02] - echo -n -e '\x00' >> $ext.append - echo -n -e '\x93\x02' >> $ext.append - # 10 in hex = 16 in decimal, lenght of name, 1 byte - echo -n -e '\x10' >> $ext.append - echo -n -e 'duckdb_signature' >> $ext.append - # the name of the WebAssembly custom section, 16 bytes - # 100 in hex, 256 in decimal - # [1(continuation) + 0000000(payload) = ff, 0(continuation) + 10(payload)], - # for a grand total of 2 bytes - echo -n -e '\x80\x02' >> $ext.append -fi - -# (Optionally) Sign binary -if [ "$DUCKDB_EXTENSION_SIGNING_PK" != "" ]; then - echo "$DUCKDB_EXTENSION_SIGNING_PK" > private.pem - $script_dir/../duckdb/scripts/compute-extension-hash.sh $ext.append > $ext.hash - openssl pkeyutl 
-sign -in $ext.hash -inkey private.pem -pkeyopt digest:sha256 -out $ext.sign - rm -f private.pem -fi - -# Signature is always there, potentially defaulting to 256 zeros -truncate -s 256 $ext.sign - -# append signature to extension binary -cat $ext.sign >> $ext.append - -# compress extension binary -if [[ $4 == wasm_* ]]; then - brotli < $ext.append > "$ext.compressed" -else - gzip < $ext.append > "$ext.compressed" -fi - -set -e - -# Abort if AWS key is not set -if [ -z "$AWS_ACCESS_KEY_ID" ]; then - echo "No AWS key found, skipping.." - exit 0 -fi - -# upload versioned version -if [[ $7 = 'true' ]]; then - if [[ $4 == wasm* ]]; then - aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm" - else - aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.gz --acl public-read - fi -fi - -# upload to latest version -if [[ $6 = 'true' ]]; then - if [[ $4 == wasm* ]]; then - aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm" - else - aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.gz --acl public-read - fi -fi diff --git a/scripts/fuzzer_helper.py b/scripts/fuzzer_helper.py new file mode 100644 index 0000000..dd82e06 --- /dev/null +++ b/scripts/fuzzer_helper.py @@ -0,0 +1,207 @@ +import json +import requests +import sys +import os +import subprocess +import reduce_sql +import fuzzer_helper + + +USERNAME = 'fuzzerofducks' + +REPO_OWNER = 'duckdb' +REPO_NAME = 'duckdb-fuzzer' + +fuzzer_desc = '''Issue found by ${FUZZER} on git commit hash [${SHORT_HASH}](https://github.com/duckdb/duckdb/commit/${FULL_HASH}) using seed ${SEED}. +''' + +header = '''### To Reproduce +```sql +''' + +middle = ''' +``` + +### Error Message +``` +''' + +footer = ''' +```''' + + +# github stuff +def issue_url(): + return 'https://api.github.com/repos/%s/%s/issues' % (REPO_OWNER, REPO_NAME) + + +def get_token(): + if 'FUZZEROFDUCKSKEY' not in os.environ: + print("FUZZEROFDUCKSKEY not found in environment variables") + exit(1) + token = os.environ['FUZZEROFDUCKSKEY'] + if len(token) == 0: + print("FUZZEROFDUCKSKEY is set but is empty") + exit(1) + + if len(token) != 40: + print("Incorrect length for FUZZEROFDUCKSKEY") + exit(1) + return token + + +def create_session(): + # Create an authenticated session to create the issue + session = requests.Session() + session.headers.update({'Authorization': 'token %s' % (get_token(),)}) + return session + + +def make_github_issue(title, body): + if len(title) > 240: + # avoid title is too long error (maximum is 256 characters) + title = title[:240] + '...' 
+ session = create_session() + url = issue_url() + issue = {'title': title, 'body': body} + r = session.post(url, json.dumps(issue)) + if r.status_code == 201: + print('Successfully created Issue "%s"' % title) + else: + print('Could not create Issue "%s"' % title) + print('Response:', r.content.decode('utf8')) + raise Exception("Failed to create issue") + + +def get_github_issues(page): + session = create_session() + url = issue_url() + '?per_page=100&page=' + str(page) + r = session.get(url) + if r.status_code != 200: + print('Failed to get list of issues') + print('Response:', r.content.decode('utf8')) + raise Exception("Failed to get list of issues") + return json.loads(r.content.decode('utf8')) + + +def close_github_issue(number): + session = create_session() + url = issue_url() + '/' + str(number) + params = {'state': 'closed'} + r = session.patch(url, json.dumps(params)) + if r.status_code == 200: + print(f'Successfully closed Issue "{number}"') + else: + print(f'Could not close Issue "{number}" (status code {r.status_code})') + print('Response:', r.content.decode('utf8')) + raise Exception("Failed to close issue") + + +def label_github_issue(number, label): + session = create_session() + url = issue_url() + '/' + str(number) + params = {'labels': [label]} + r = session.patch(url, json.dumps(params)) + if r.status_code == 200: + print(f'Successfully labeled Issue "{number}"') + else: + print(f'Could not label Issue "{number}" (status code {r.status_code})') + print('Response:', r.content.decode('utf8')) + raise Exception("Failed to label issue") + + +def extract_issue(body, nr): + try: + splits = body.split(middle) + sql = splits[0].split(header)[1] + error = splits[1][: -len(footer)] + return (sql, error) + except: + print(f"Failed to extract SQL/error message from issue {nr}") + print(body) + return None + + +def run_shell_command_batch(shell, cmd): + command = [shell, '--batch', '-init', '/dev/null'] + + try: + res = subprocess.run( + command, input=bytearray(cmd, 'utf8'), stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=300 + ) + except subprocess.TimeoutExpired: + print(f"TIMEOUT... 
{cmd}") + return ("", "", 0, True) + stdout = res.stdout.decode('utf8').strip() + stderr = res.stderr.decode('utf8').strip() + return (stdout, stderr, res.returncode, False) + + +def test_reproducibility(shell, issue, current_errors, perform_check): + extract = extract_issue(issue['body'], issue['number']) + labels = issue['labels'] + label_timeout = False + for label in labels: + if label['name'] == 'timeout': + label_timeout = True + if extract is None: + # failed extract: leave the issue as-is + return True + sql = extract[0] + ';' + error = extract[1] + if perform_check is True and label_timeout is False: + print(f"Checking issue {issue['number']}...") + (stdout, stderr, returncode, is_timeout) = run_shell_command_batch(shell, sql) + if is_timeout: + label_github_issue(issue['number'], 'timeout') + else: + if returncode == 0: + return False + if not fuzzer_helper.is_internal_error(stderr): + return False + # issue is still reproducible + current_errors[error] = issue + return True + + +def extract_github_issues(shell, perform_check): + current_errors = dict() + for p in range(1, 10): + issues = get_github_issues(p) + for issue in issues: + # check if the github issue is still reproducible + if not test_reproducibility(shell, issue, current_errors, perform_check): + # the issue appears to be fixed - close the issue + print(f"Failed to reproduce issue {issue['number']}, closing...") + close_github_issue(int(issue['number'])) + return current_errors + + +def file_issue(cmd, error_msg, fuzzer, seed, hash): + # issue is new, file it + print("Filing new issue to Github") + + title = error_msg + body = ( + fuzzer_desc.replace("${FUZZER}", fuzzer) + .replace("${FULL_HASH}", hash) + .replace("${SHORT_HASH}", hash[:5]) + .replace("${SEED}", str(seed)) + ) + body += header + cmd + middle + error_msg + footer + print(title, body) + make_github_issue(title, body) + + +def is_internal_error(error): + if 'differs from original result' in error: + return True + if 'INTERNAL' in error: + return True + if 'signed integer overflow' in error: + return True + if 'Sanitizer' in error or 'sanitizer' in error: + return True + if 'runtime error' in error: + return True + return False diff --git a/scripts/parser_test.py b/scripts/parser_test.py new file mode 100644 index 0000000..e09fbe3 --- /dev/null +++ b/scripts/parser_test.py @@ -0,0 +1,21 @@ +from sqllogictest import SQLParserException, SQLLogicParser, SQLLogicTest + +from typing import Optional +import argparse + + +def main(): + parser = argparse.ArgumentParser(description="SQL Logic Parser") + parser.add_argument("filename", type=str, help="Path to the SQL logic file") + args = parser.parse_args() + + filename = args.filename + + parser = SQLLogicParser() + out: Optional[SQLLogicTest] = parser.parse(filename) + if not out: + raise SQLParserException(f"Test {filename} could not be parsed") + + +if __name__ == "__main__": + main() diff --git a/scripts/reduce_sql.py b/scripts/reduce_sql.py new file mode 100644 index 0000000..245a89b --- /dev/null +++ b/scripts/reduce_sql.py @@ -0,0 +1,368 @@ +import re +import subprocess +import time +import os +import fuzzer_helper +import multiprocessing +import sqlite3 + +# this script can be used as a library, but can also be directly called +# example usage: +# python3 scripts/reduce_sql.py --load load.sql --exec exec.sql + +try: + multiprocessing.set_start_method('fork') +except RuntimeError: + pass +get_reduced_query = ''' +SELECT * FROM reduce_sql_statement('${QUERY}'); +''' + + +class MultiStatementManager: + 
delimiter = ';' + + def __init__(self, multi_statement): + # strip whitespace, then the final ';', and split on all ';' inbetween. + statements = list( + map(lambda x: x.strip(), multi_statement.strip().strip(';').split(MultiStatementManager.delimiter)) + ) + self.statements = [] + for stmt in statements: + if len(stmt) > 0: + self.statements.append(stmt.strip() + ";") + + def is_multi_statement(sql_statement): + if len(sql_statement.split(';')) > 1: + return True + return False + + def get_last_statement(self): + return self.statements[-1] + + +def sanitize_error(err): + err = re.sub(r'Error: near line \d+: ', '', err) + err = err.replace(os.getcwd() + '/', '') + err = err.replace(os.getcwd(), '') + if 'AddressSanitizer' in err: + match = re.search(r'[ \t]+[#]0 ([A-Za-z0-9]+) ([^\n]+)', err).groups()[1] + err = 'AddressSanitizer error ' + match + return err + + +def run_shell_command(shell, cmd): + command = [shell, '-csv', '--batch', '-init', '/dev/null'] + + res = subprocess.run(command, input=bytearray(cmd, 'utf8'), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout = res.stdout.decode('utf8').strip() + stderr = res.stderr.decode('utf8').strip() + return (stdout, stderr, res.returncode) + + +def get_reduced_sql(shell, sql_query): + reduce_query = get_reduced_query.replace('${QUERY}', sql_query.replace("'", "''")) + (stdout, stderr, returncode) = run_shell_command(shell, reduce_query) + if returncode != 0: + print(stdout) + print(stderr) + raise Exception("Failed to reduce query") + reduce_candidates = [] + for line in stdout.split('\n'): + reduce_candidates.append(line.strip('"').replace('""', '"')) + return reduce_candidates[1:] + + +def reduce(sql_query, data_load, shell, error_msg, max_time_seconds=300): + start = time.time() + while True: + found_new_candidate = False + reduce_candidates = get_reduced_sql(shell, sql_query) + for reduce_candidate in reduce_candidates: + if reduce_candidate == sql_query: + continue + current_time = time.time() + if current_time - start > max_time_seconds: + break + + (stdout, stderr, returncode) = run_shell_command(shell, data_load + reduce_candidate) + new_error = sanitize_error(stderr) + if new_error == error_msg: + sql_query = reduce_candidate + found_new_candidate = True + print("Found new reduced query") + print("=======================") + print(sql_query) + print("=======================") + break + if not found_new_candidate: + break + return sql_query + + +def is_ddl_query(query): + query = query.lower() + if 'create' in query or 'insert' in query or 'update' in query or 'delete' in query: + return True + return False + + +def initial_cleanup(query_log): + query_log = query_log.replace('SELECT * FROM pragma_version()\n', '') + return query_log + + +def run_queries_until_crash_mp(queries, result_file): + import duckdb + + con = duckdb.connect() + sqlite_con = sqlite3.connect(result_file) + sqlite_con.execute('CREATE TABLE queries(id INT, text VARCHAR)') + sqlite_con.execute('CREATE TABLE result(text VARCHAR)') + sqlite_con.execute("INSERT INTO result VALUES ('__CRASH__')") + id = 1 + is_internal_error = False + for q in queries: + # insert the current query into the database + # we do this pre-emptively in case the program crashes + sqlite_con.execute('INSERT INTO queries VALUES (?, ?)', (id, q)) + sqlite_con.commit() + + keep_query = False + try: + con.execute(q) + keep_query = is_ddl_query(q) + except Exception as e: + exception_error = str(e) + is_internal_error = fuzzer_helper.is_internal_error(exception_error) + if 
is_internal_error: + keep_query = True + sqlite_con.execute('UPDATE result SET text=?', (exception_error,)) + if not keep_query: + sqlite_con.execute('DELETE FROM queries WHERE id=?', (id,)) + if is_internal_error: + # found internal error: no need to try further queries + break + id += 1 + if not is_internal_error: + # failed to reproduce: delete result + sqlite_con.execute('DELETE FROM result') + sqlite_con.commit() + sqlite_con.close() + + +def run_queries_until_crash(queries): + sqlite_file = 'cleaned_queries.db' + if os.path.isfile(sqlite_file): + os.remove(sqlite_file) + # run the queries in a separate process because it might crash + p = multiprocessing.Process(target=run_queries_until_crash_mp, args=(queries, sqlite_file)) + p.start() + p.join() + + # read the queries back from the file + sqlite_con = sqlite3.connect(sqlite_file) + queries = sqlite_con.execute('SELECT text FROM queries ORDER BY id').fetchall() + results = sqlite_con.execute('SELECT text FROM result').fetchall() + sqlite_con.close() + if len(results) == 0: + # no internal error or crash found + return (None, None) + assert len(results) == 1 + return ([x[0] for x in queries], results[0][0]) + + +def cleanup_irrelevant_queries(query_log): + query_log = initial_cleanup(query_log) + + queries = [x for x in query_log.split(';\n') if len(x) > 0] + return run_queries_until_crash(queries) + + +# def reduce_internal(start, sql_query, data_load, queries_final, shell, error_msg, max_time_seconds=300): + + +def reduce_query_log_query(start, shell, queries, query_index, max_time_seconds): + new_query_list = queries[:] + sql_query = queries[query_index] + while True: + found_new_candidate = False + reduce_candidates = get_reduced_sql(shell, sql_query) + for reduce_candidate in reduce_candidates: + if reduce_candidate == sql_query: + continue + current_time = time.time() + if current_time - start > max_time_seconds: + break + + new_query_list[query_index] = reduce_candidate + (_, error) = run_queries_until_crash(new_query_list) + + if error is not None: + sql_query = reduce_candidate + found_new_candidate = True + print("Found new reduced query") + print("=======================") + print(sql_query) + print("========ERROR==========") + print(error) + print("=======================") + print("") + break + if not found_new_candidate: + break + return sql_query + + +def reduce_multi_statement(sql_queries, local_shell, local_data_load): + reducer = MultiStatementManager(sql_queries) + last_statement = reducer.get_last_statement() + print(f"testing if just last statement of multi statement creates the error") + (stdout, stderr, returncode) = run_shell_command(local_shell, local_data_load + last_statement) + expected_error = sanitize_error(stderr).strip() + if len(expected_error) > 0: + # reduce just the last statement + return reduce(last_statement, local_data_load, local_shell, expected_error, int(args.max_time)) + queries = reduce_query_log(reducer.statements, local_shell, [local_data_load]) + return "\n".join(queries) + + +def reduce_query_log(queries, shell, data_load=[], max_time_seconds=300): + start = time.time() + current_index = 0 + # first try to remove as many queries as possible + while current_index < len(queries): + print("Attempting to remove query at position %d (of %d total queries)" % (current_index, len(queries))) + current_time = time.time() + if current_time - start > max_time_seconds: + break + # remove the query at "current_index" + new_queries = queries[:current_index] + queries[current_index + 1 :] + 
new_queries_with_data = data_load + new_queries + # try to run the queries and check if we still get the same error + (new_queries_x, current_error) = run_queries_until_crash(new_queries_with_data) + if current_error is None: + # cannot remove this query without invalidating the test case + current_index += 1 + else: + # we can remove this query + queries = new_queries + # now try to reduce individual queries + for i in range(len(queries)): + if is_ddl_query(queries[i]): + continue + current_time = time.time() + if current_time - start > max_time_seconds: + break + queries[i] = reduce_query_log_query(start, shell, queries, i, max_time_seconds) + return queries + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description='Reduce a problematic SQL query') + parser.add_argument( + '--shell', dest='shell', action='store', help='Path to the shell executable', default='build/debug/duckdb' + ) + parser.add_argument('--load', dest='load', action='store', help='Path to the data load script', required=True) + parser.add_argument('--exec', dest='exec', action='store', help='Path to the executable script', required=True) + parser.add_argument( + '--inplace', dest='inplace', action='store_true', help='If true, overrides the exec script with the final query' + ) + parser.add_argument( + '--max-time', dest='max_time', action='store', help='Maximum time in seconds to run the reducer', default=300 + ) + + args = parser.parse_args() + print("Starting reduce process") + + shell = args.shell + data_load = open(args.load).read() + sql_query = open(args.exec).read() + (stdout, stderr, returncode) = run_shell_command(shell, data_load + sql_query) + expected_error = sanitize_error(stderr).strip() + if len(expected_error) == 0: + print("===================================================") + print("Could not find expected error - no error encountered") + print("===================================================") + exit(1) + + print("===================================================") + print("Found expected error") + print("===================================================") + print(expected_error) + print("===================================================") + + if MultiStatementManager.is_multi_statement(sql_query): + final_query = reduce_multi_statement(sql_query, shell, data_load) + else: + final_query = reduce(sql_query, data_load, shell, expected_error, int(args.max_time)) + + print("Found final reduced query") + print("===================================================") + print(final_query) + print("===================================================") + if args.inplace: + print(f"Writing to file {args.exec}") + with open(args.exec, 'w+') as f: + f.write(final_query) + + +# Example usage: +# error_msg = 'INTERNAL Error: Assertion triggered in file "/Users/myth/Programs/duckdb-bugfix/src/common/types/data_chunk.cpp" on line 41: !types.empty()' +# shell = 'build/debug/duckdb' +# data_load = 'create table all_types as select * from test_all_types();' +# sql_query = ''' +# select +# subq_0.c0 as c0, +# contains( +# cast(cast(nullif( +# argmax( +# cast(case when 0 then (select varchar from main.all_types limit 1 offset 5) +# else (select varchar from main.all_types limit 1 offset 5) +# end +# as varchar), +# cast(decode( +# cast(cast(null as blob) as blob)) as varchar)) over (partition by subq_0.c1 order by subq_0.c1), +# current_schema()) as varchar) as varchar), +# cast(cast(nullif(cast(null as varchar), +# cast(null as varchar)) as varchar) as varchar)) as c1, +# 
(select min(time) from main.all_types) +# as c2, +# subq_0.c1 as c3, +# subq_0.c1 as c4, +# cast(nullif(subq_0.c1, +# subq_0.c1) as decimal(4,1)) as c5 +# from +# (select +# ref_0.timestamp_ns as c0, +# case when (EXISTS ( +# select +# ref_0.timestamp_ns as c0, +# ref_0.timestamp_ns as c1, +# (select timestamp_tz from main.all_types limit 1 offset 4) +# as c2, +# ref_1.int_array as c3, +# ref_1.dec_4_1 as c4, +# ref_0.utinyint as c5, +# ref_1.int as c6, +# ref_0.double as c7, +# ref_0.medium_enum as c8, +# ref_1.array_of_structs as c9, +# ref_1.varchar as c10 +# from +# main.all_types as ref_1 +# where ref_1.varchar ~~~ ref_1.varchar +# limit 28)) +# or (ref_0.varchar ~~~ ref_0.varchar) then ref_0.dec_4_1 else ref_0.dec_4_1 end +# as c1 +# from +# main.all_types as ref_0 +# where (0) +# and (ref_0.varchar ~~ ref_0.varchar)) as subq_0 +# where writefile() !~~* writefile() +# limit 88 +# ''' +# +# print(reduce(sql_query, data_load, shell, error_msg)) diff --git a/scripts/run_fuzzer.py b/scripts/run_fuzzer.py new file mode 100644 index 0000000..f2bb8f3 --- /dev/null +++ b/scripts/run_fuzzer.py @@ -0,0 +1,199 @@ +import json +import requests +import sys +import os +import subprocess +import reduce_sql +import fuzzer_helper +import random + +seed = -1 + +fuzzer = None +db = None +shell = None +perform_checks = True +dry = False +for param in sys.argv: + if param == '--sqlsmith': + fuzzer = 'sqlsmith' + elif param == '--duckfuzz': + fuzzer = 'duckfuzz' + elif param == '--duckfuzz_functions': + fuzzer = 'duckfuzz_functions' + elif param == '--alltypes': + db = 'alltypes' + elif param == '--tpch': + db = 'tpch' + elif param == '--emptyalltypes': + db = 'emptyalltypes' + elif param == '--no_checks': + perform_checks = False + elif param.startswith('--shell='): + shell = param.replace('--shell=', '') + elif param.startswith('--seed='): + seed = int(param.replace('--seed=', '')) + elif param.startswith('--dry'): + dry = True + +if fuzzer is None: + print("Unrecognized fuzzer to run, expected e.g. --sqlsmith or --duckfuzz") + exit(1) + +if db is None: + print("Unrecognized database to run on, expected either --tpch, --alltypes or --emptyalltypes") + exit(1) + +if shell is None: + print("Unrecognized path to shell, expected e.g. 
--shell=build/debug/duckdb") + exit(1) + +if seed < 0: + seed = random.randint(0, 2**30) + +git_hash = os.getenv('DUCKDB_HASH') + + +def create_db_script(db): + if db == 'alltypes': + return 'create table all_types as select * exclude(small_enum, medium_enum, large_enum) from test_all_types();' + elif db == 'tpch': + return 'call dbgen(sf=0.1);' + elif db == 'emptyalltypes': + return 'create table all_types as select * exclude(small_enum, medium_enum, large_enum) from test_all_types() limit 0;' + else: + raise Exception("Unknown database creation script") + + +def run_fuzzer_script(fuzzer): + if fuzzer == 'sqlsmith': + return "call sqlsmith(max_queries=${MAX_QUERIES}, seed=${SEED}, verbose_output=1, log='${LAST_LOG_FILE}', complete_log='${COMPLETE_LOG_FILE}');" + elif fuzzer == 'duckfuzz': + return "call fuzzyduck(max_queries=${MAX_QUERIES}, seed=${SEED}, verbose_output=1, log='${LAST_LOG_FILE}', complete_log='${COMPLETE_LOG_FILE}');" + elif fuzzer == 'duckfuzz_functions': + return "call fuzz_all_functions(seed=${SEED}, verbose_output=1, log='${LAST_LOG_FILE}', complete_log='${COMPLETE_LOG_FILE}');" + else: + raise Exception("Unknown fuzzer type") + + +def get_fuzzer_name(fuzzer): + if fuzzer == 'sqlsmith': + return 'SQLSmith' + elif fuzzer == 'duckfuzz': + return 'DuckFuzz' + elif fuzzer == 'duckfuzz_functions': + return 'DuckFuzz (Functions)' + else: + return 'Unknown' + + +def run_shell_command(cmd): + command = [shell, '--batch', '-init', '/dev/null'] + + res = subprocess.run(command, input=bytearray(cmd, 'utf8'), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout = res.stdout.decode('utf8', 'ignore').strip() + stderr = res.stderr.decode('utf8', 'ignore').strip() + return (stdout, stderr, res.returncode) + + +# first get a list of all github issues, and check if we can still reproduce them + +if dry: + current_errors = [] +else: + current_errors = fuzzer_helper.extract_github_issues(shell, perform_checks) + +max_queries = 2000 +last_query_log_file = 'sqlsmith.log' +complete_log_file = 'sqlsmith.complete.log' + +print( + f'''========================================== + RUNNING {fuzzer} on {db} +==========================================''' +) + +load_script = create_db_script(db) +fuzzer_name = get_fuzzer_name(fuzzer) +fuzzer = ( + run_fuzzer_script(fuzzer) + .replace('${MAX_QUERIES}', str(max_queries)) + .replace('${LAST_LOG_FILE}', last_query_log_file) + .replace('${COMPLETE_LOG_FILE}', complete_log_file) + .replace('${SEED}', str(seed)) +) + +print(load_script) +print(fuzzer) + +cmd = load_script + "\n" + fuzzer + +print("==========================================") + +(stdout, stderr, returncode) = run_shell_command(cmd) + +print( + f'''========================================== + FINISHED RUNNING +==========================================''' +) +print("============== STDOUT ================") +print(stdout) +print("============== STDERR =================") +print(stderr) +print("==========================================") + +print(returncode) +if returncode == 0: + print("============== SUCCESS ================") + exit(0) + +print("============== FAILURE ================") +print("Attempting to reproduce and file issue...") + +# run the last query, and see if the issue persists +with open(last_query_log_file, 'r') as f: + last_query = f.read() + +with open(complete_log_file, 'r') as f: + all_queries = f.read() + +(stdout, stderr, returncode) = run_shell_command(load_script + all_queries) +if returncode == 0: + print("Failed to reproduce the issue...") + exit(0) + 
+print("============== STDOUT ================") +print(stdout) +print("============== STDERR =================") +print(stderr) +print("==========================================") +if not fuzzer_helper.is_internal_error(stderr): + print("Failed to reproduce the internal error") + exit(0) + +error_msg = reduce_sql.sanitize_error(stderr) + +print("=========================================") +print(" Reproduced successfully ") +print("=========================================") + +# check if this is a duplicate issue +if error_msg in current_errors: + print("Skip filing duplicate issue") + print( + "Issue already exists: https://github.com/duckdb/duckdb-fuzzer/issues/" + + str(current_errors[error_msg]['number']) + ) + exit(0) + +print("=========================================") +print(" Attempting to reduce query ") +print("=========================================") +# try to reduce the query as much as possible +# reduce_multi_statement checks just the last statement first as a heuristic to see if +# only the last statement causes the error. +required_queries = reduce_sql.reduce_multi_statement(all_queries, shell, load_script) +cmd = load_script + '\n' + last_query + "\n" + +fuzzer_helper.file_issue(cmd, error_msg, fuzzer_name, seed, git_hash) diff --git a/scripts/run_sqlancer.py b/scripts/run_sqlancer.py new file mode 100644 index 0000000..2646de4 --- /dev/null +++ b/scripts/run_sqlancer.py @@ -0,0 +1,150 @@ +import os +import random +import subprocess +import sys +import reduce_sql +import fuzzer_helper + +persistent = False +sqlancer_dir = 'sqlancer' +seed = None +timeout = 600 +threads = 1 +num_queries = 1000 +shell = None + +# python3 scripts/run_sqlancer.py --sqlancer=/Users/myth/Programs/sqlancer --shell=build/debug/duckdb --seed=0 +for arg in sys.argv: + if arg == '--persistent': + persistent = True + elif arg.startswith('--sqlancer='): + sqlancer_dir = arg.replace('--sqlancer=', '') + elif arg.startswith('--seed='): + seed = int(arg.replace('--seed=', '')) + elif arg.startswith('--timeout='): + timeout = int(arg.replace('--timeout=', '')) + elif arg.startswith('--threads='): + threads = int(arg.replace('--threads=', '')) + elif arg.startswith('--num-queries='): + num_queries = int(arg.replace('--num-queries=', '')) + elif arg.startswith('--shell='): + shell = arg.replace('--shell=', '') + +if shell is None: + print("Unrecognized path to shell, expected e.g. 
--shell=build/debug/duckdb") + exit(1) + +if not os.path.isfile(shell): + print(f"Could not find shell \"{shell}\"") + exit(1) + +if seed is None: + seed = random.randint(0, 2**30) + +git_hash = fuzzer_helper.get_github_hash() + +targetdir = os.path.join(sqlancer_dir, 'target') +filenames = os.listdir(targetdir) +found_filename = "" +for fname in filenames: + if 'sqlancer-' in fname.lower(): + found_filename = fname + break + +if not found_filename: + print("FAILED TO RUN SQLANCER") + print("Could not find target file sqlancer/target/sqlancer-*.jar") + exit(1) + +command_prefix = ['java'] +if persistent: + command_prefix += ['-Dduckdb.database.file=/tmp/lancer_duckdb_db'] +command_prefix += ['-jar', os.path.join(targetdir, found_filename)] + +seed_text = '' +if seed is not None: + seed_text = f'--random-seed {seed}' + +base_cmd = f'--num-queries {num_queries} --num-threads {threads} {seed_text} --log-each-select=true --timeout-seconds {timeout} duckdb' +command = [x for x in base_cmd.split(' ') if len(x) > 0] + +print('--------------------- RUNNING SQLANCER ----------------------') +print(' '.join(command_prefix + command)) + +subprocess = subprocess.Popen(command_prefix + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) +out = subprocess.stdout.read() +err = subprocess.stderr.read() +subprocess.wait() + +if subprocess.returncode == 0: + print('--------------------- SQLANCER SUCCESS ----------------------') + print('SQLANCER EXITED WITH CODE ' + str(subprocess.returncode)) + exit(0) + +print('--------------------- SQLANCER FAILURE ----------------------') +print('SQLANCER EXITED WITH CODE ' + str(subprocess.returncode)) +print('--------------------- SQLANCER ERROR LOG ----------------------') +print(err.decode('utf8', 'ignore')) +print('--------------------- SQLancer Logs ----------------------') +print(out.decode('utf8', 'ignore')) +try: + with open('duckdb-queries.log', 'r') as f: + text = f.read() + print('--------------------- DuckDB Logs ----------------------') + print(text) +except: + pass + + +with open('duckdb-queries.log', 'r') as f: + query_log = f.read() + +# clean up any irrelevant SELECT statements and failing DDL statements +(queries, expected_error) = reduce_sql.cleanup_irrelevant_queries(query_log) +if queries is None: + print('----------------------------------------------') + print("Failed to reproduce SQLancer error!") + print('----------------------------------------------') + exit(0) + +print('----------------------------------------------') +print("Found query log that produces the following error") +print('----------------------------------------------') +if expected_error == '__CRASH__': + print('CRASH!') +else: + print(expected_error) + +print('----------------------------------------------') +print("Starting reduction process") +print('----------------------------------------------') + +# clean up queries from the query log by trying to remove queries one by one +queries = reduce_sql.reduce_query_log(queries, shell, []) + +reduced_test_case = ';\n'.join(queries) +print('----------------------------------------------') +print("Found reproducible test case") +print('----------------------------------------------') +print(reduced_test_case) + +(stdout, stderr, returncode) = reduce_sql.run_shell_command(shell, reduced_test_case) +error_msg = reduce_sql.sanitize_error(stderr) + +print('----------------------------------------------') +print("Fetching github issues") +print('----------------------------------------------') + +# first get a list of all github 
issues, and check if we can still reproduce them +current_errors = fuzzer_helper.extract_github_issues(shell) + +# check if this is a duplicate issue +if error_msg in current_errors: + print("Skip filing duplicate issue") + print( + "Issue already exists: https://github.com/duckdb/duckdb-fuzzer/issues/" + + str(current_errors[error_msg]['number']) + ) + exit(0) + +fuzzer_helper.file_issue(reduced_test_case, error_msg, "SQLancer", seed, git_hash) diff --git a/scripts/run_test_list.py b/scripts/run_test_list.py new file mode 100644 index 0000000..3ba3426 --- /dev/null +++ b/scripts/run_test_list.py @@ -0,0 +1,65 @@ +import sys +import subprocess +import re +import os + +# wheth +no_exit = False +for i in range(len(sys.argv)): + if sys.argv[i] == '--no-exit': + no_exit = True + del sys.argv[i] + i -= 1 + +if len(sys.argv) < 2: + print("Expected usage: python3 scripts/run_test_list.py build/debug/test/unittest [--no-exit]") + exit(1) +unittest_program = sys.argv[1] +extra_args = [] +if len(sys.argv) > 2: + extra_args = [sys.argv[2]] + + +test_cases = [] +for line in sys.stdin: + if len(line.strip()) == 0: + continue + splits = line.rsplit('\t', 1) + test_cases.append(splits[0]) + +test_count = len(test_cases) +return_code = 0 +for test_number in range(test_count): + sys.stdout.write("[" + str(test_number) + "/" + str(test_count) + "]: " + test_cases[test_number]) + sys.stdout.flush() + res = subprocess.run([unittest_program, test_cases[test_number]], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout = res.stdout.decode('utf8') + stderr = res.stderr.decode('utf8') + if res.returncode is not None and res.returncode != 0: + print("FAILURE IN RUNNING TEST") + print( + """-------------------- +RETURNCODE +-------------------- +""" + ) + print(res.returncode) + print( + """-------------------- +STDOUT +-------------------- +""" + ) + print(stdout) + print( + """-------------------- +STDERR +-------------------- +""" + ) + print(stderr) + return_code = 1 + if not no_exit: + break + +exit(return_code) diff --git a/scripts/runsqlsmith.py b/scripts/runsqlsmith.py new file mode 100644 index 0000000..9569750 --- /dev/null +++ b/scripts/runsqlsmith.py @@ -0,0 +1,52 @@ +# run SQL smith and collect breaking queries +import os +import re +import subprocess +import sys +import sqlite3 +from python_helpers import open_utf8 + +sqlsmith_db = 'sqlsmith.db' +sqlsmith_test_dir = 'test/sqlsmith/queries' + +export_queries = False + +con = sqlite3.connect(sqlsmith_db) +c = con.cursor() + +if len(sys.argv) == 2: + if sys.argv[1] == '--export': + export_queries = True + elif sys.argv[1] == '--reset': + c.execute('DROP TABLE IF EXISTS sqlsmith_errors') + else: + print('Unknown query option ' + sys.argv[1]) + exit(1) + +if export_queries: + c.execute('SELECT query FROM sqlsmith_errors') + results = c.fetchall() + for fname in os.listdir(sqlsmith_test_dir): + os.remove(os.path.join(sqlsmith_test_dir, fname)) + + for i in range(len(results)): + with open(os.path.join(sqlsmith_test_dir, 'sqlsmith-%d.sql' % (i + 1)), 'w+') as f: + f.write(results[i][0] + "\n") + exit(0) + + +def run_sqlsmith(): + subprocess.call(['build/debug/third_party/sqlsmith/sqlsmith', '--duckdb=:memory:']) + + +c.execute('CREATE TABLE IF NOT EXISTS sqlsmith_errors(query VARCHAR)') + +while True: + # run SQL smith + run_sqlsmith() + # get the breaking query + with open_utf8('sqlsmith.log', 'r') as f: + text = re.sub('[ \t\n]+', ' ', f.read()) + + c.execute('INSERT INTO sqlsmith_errors VALUES (?)', (text,)) + con.commit() diff --git 
a/scripts/try_timeout.py b/scripts/try_timeout.py new file mode 100644 index 0000000..78bd321 --- /dev/null +++ b/scripts/try_timeout.py @@ -0,0 +1,48 @@ +import os +import sys +import subprocess +import threading + +if len(sys.argv) < 3: + print("Expected python3 scripts/try_timeout.py --timeout=[timeout] --retry=[retries] [cmd] [options...]") + print("Timeout should be given in seconds") + exit(1) + +timeout = int(sys.argv[1].replace("--timeout=", "")) +retries = int(sys.argv[2].replace("--retry=", "")) +cmd = sys.argv[3:] + + +class Command(object): + def __init__(self, cmd): + self.cmd = cmd + self.process = None + + def run(self, timeout): + self.process = None + + def target(): + self.process = subprocess.Popen(self.cmd) + self.process.communicate() + + thread = threading.Thread(target=target) + thread.start() + + thread.join(timeout) + if thread.is_alive(): + print('Terminating process: process exceeded timeout of ' + str(timeout) + ' seconds') + self.process.terminate() + thread.join() + if self.process is None: + return 1 + return self.process.returncode + + +for i in range(retries): + print("Attempting to run command \"" + ' '.join(cmd) + '"') + command = Command(cmd) + returncode = command.run(timeout) + if returncode == 0: + exit(0) + +exit(1) From d789c661081ab44cfe9988a905c8cbd5376e9c9e Mon Sep 17 00:00:00 2001 From: Zuleykha Pavlichenkova Date: Thu, 4 Jul 2024 09:35:43 +0200 Subject: [PATCH 3/5] updated again --- duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/duckdb b/duckdb index 7b2cdc7..f108981 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 7b2cdc786bf64ed776941a3e4a65722941b957a6 +Subproject commit f1089810dfff9f560595a7662be3d1d8022bf665 From 4bc4c213717851157642d18488eb2347fc8d378d Mon Sep 17 00:00:00 2001 From: Zuleykha Pavlichenkova Date: Thu, 4 Jul 2024 17:21:14 +0200 Subject: [PATCH 4/5] duckdb submodule update --- duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/duckdb b/duckdb index 71941d0..5be7060 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 71941d0417284dede04f623f45c1552f56515f4f +Subproject commit 5be70607225453016b36361f56c832517b1cdb8a From 13727e4b14692aa7002df3a4218b252f1b5e21b0 Mon Sep 17 00:00:00 2001 From: Zuleykha Pavlichenkova Date: Wed, 10 Jul 2024 13:18:26 +0200 Subject: [PATCH 5/5] Change MainDistributionPipeline.yml file to use shared credentials --- .../workflows/MainDistributionPipeline.yml | 10 +- .github/workflows/_extension_deploy.yml | 121 ------------------ 2 files changed, 6 insertions(+), 125 deletions(-) delete mode 100644 .github/workflows/_extension_deploy.yml diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index a88fc9c..1e26fae 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -16,15 +16,17 @@ jobs: name: Build extension binaries uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.0.0 with: - duckdb_version: main + duckdb_version: v1.0.0 extension_name: sqlsmith + exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads;windows_amd64_rtools' duckdb-stable-deploy: name: Deploy extension binaries needs: duckdb-stable-build - uses: ./.github/workflows/_extension_deploy.yml + uses: duckdb/extension-ci-tools/.github/workflows/_extension_deploy.yml@v1.0.0 secrets: inherit with: - duckdb_version: main + duckdb_version: v1.0.0 extension_name: sqlsmith - deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || 
github.ref == 'refs/heads/main' }} + exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads;windows_amd64_rtools' + deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }} \ No newline at end of file diff --git a/.github/workflows/_extension_deploy.yml b/.github/workflows/_extension_deploy.yml deleted file mode 100644 index c408f90..0000000 --- a/.github/workflows/_extension_deploy.yml +++ /dev/null @@ -1,121 +0,0 @@ -# -# Reusable workflow that deploys the artifacts produced by github.com/duckdb/duckdb/.github/workflows/_extension_distribution.yml -# -# note: this workflow needs to be located in the extension repository, as it requires secrets to be passed to the -# deploy script. However, it should generally not be necessary to modify this workflow in your extension repository, as -# this workflow can be configured to use a custom deploy script. - - -name: Extension Deployment -on: - workflow_call: - inputs: - # The name of the extension - extension_name: - required: true - type: string - # DuckDB version to build against - duckdb_version: - required: true - type: string - # ';' separated list of architectures to exclude, for example: 'linux_amd64;osx_arm64' - exclude_archs: - required: false - type: string - default: "" - # Whether to upload this deployment as the latest. This may overwrite a previous deployment. - deploy_latest: - required: false - type: boolean - default: false - # Whether to upload this deployment under a versioned path. These will not be deleted automatically - deploy_versioned: - required: false - type: boolean - default: false - # Postfix added to artifact names. Can be used to guarantee unique names when this workflow is called multiple times - artifact_postfix: - required: false - type: string - default: "" - # Override the default deploy script with a custom script - deploy_script: - required: false - type: string - default: "./scripts/extension-upload.sh" - # Override the default matrix parse script with a custom script - matrix_parse_script: - required: false - type: string - default: "./duckdb/scripts/modify_distribution_matrix.py" - -jobs: - generate_matrix: - name: Generate matrix - runs-on: ubuntu-latest - outputs: - deploy_matrix: ${{ steps.parse-matrices.outputs.deploy_matrix }} - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - submodules: 'true' - - - name: Checkout DuckDB to version - run: | - cd duckdb - git checkout ${{ inputs.duckdb_version }} - - - id: parse-matrices - run: | - python3 ${{ inputs.matrix_parse_script }} --input ./duckdb/.github/config/distribution_matrix.json --deploy_matrix --output deploy_matrix.json --exclude "${{ inputs.exclude_archs }}" --pretty - deploy_matrix="`cat deploy_matrix.json`" - echo deploy_matrix=$deploy_matrix >> $GITHUB_OUTPUT - echo `cat $GITHUB_OUTPUT` - - deploy: - name: Deploy - runs-on: ubuntu-latest - needs: generate_matrix - if: ${{ needs.generate_matrix.outputs.deploy_matrix != '{}' && needs.generate_matrix.outputs.deploy_matrix != '' }} - strategy: - matrix: ${{fromJson(needs.generate_matrix.outputs.deploy_matrix)}} - - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - submodules: 'true' - - - name: Checkout DuckDB to version - run: | - cd duckdb - git checkout ${{ inputs.duckdb_version }} - - - uses: actions/download-artifact@v2 - with: - name: ${{ inputs.extension_name }}-${{ inputs.duckdb_version }}-extension-${{matrix.duckdb_arch}}${{inputs.artifact_postfix}}${{startsWith(matrix.duckdb, 'wasm') && '.wasm' || ''}} - path: | - /tmp/extension - 
- - name: Deploy - shell: bash - env: - AWS_ACCESS_KEY_ID: ${{ secrets.S3_DEPLOY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_DEPLOY_KEY }} - AWS_DEFAULT_REGION: ${{ secrets.S3_REGION }} - BUCKET_NAME: ${{ secrets.S3_BUCKET }} - DUCKDB_EXTENSION_SIGNING_PK: ${{ secrets.S3_DUCKDB_ORG_EXTENSION_SIGNING_PK }} - run: | - pwd - python3 -m pip install pip awscli - git config --global --add safe.directory '*' - cd duckdb - git fetch --tags - export DUCKDB_VERSION=`git tag --points-at HEAD` - export DUCKDB_VERSION=${DUCKDB_VERSION:=`git log -1 --format=%h`} - cd .. - git fetch --tags - export EXT_VERSION=`git tag --points-at HEAD` - export EXT_VERSION=${EXT_VERSION:=`git log -1 --format=%h`} - ${{ inputs.deploy_script }} ${{ inputs.extension_name }} $EXT_VERSION $DUCKDB_VERSION ${{ matrix.duckdb_arch }} $BUCKET_NAME ${{inputs.deploy_latest || 'true' && 'false'}} ${{inputs.deploy_versioned || 'true' && 'false'}}
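
Editor's note (not part of the patch series): the scripts added in patch 2 all follow one pattern — they feed SQL into a DuckDB shell build in batch mode, scrape stdout/stderr, and escalate to the reduce/file-issue path on failure. The sketch below is a minimal, hedged illustration of that pattern as `run_fuzzer.py` uses it. The shell path, seed, and query count are assumptions taken from the script defaults (`build/debug/duckdb`, a random seed, `max_queries = 2000`); the `sqlsmith(...)` call, log file names, and load script are quoted from the patch, but this snippet is only a sketch, not the CI entry point itself.

```python
# Minimal sketch of how the new fuzzer scripts drive the shell.
# Assumptions: a debug shell at build/debug/duckdb with the sqlsmith extension
# built in; 'sqlsmith.log' / 'sqlsmith.complete.log' are the log paths that
# run_fuzzer.py also uses.
import subprocess

SHELL = "build/debug/duckdb"   # assumed path, matches the scripts' default
SEED = 42                      # run_fuzzer.py normally picks a random seed
MAX_QUERIES = 2000             # default used by run_fuzzer.py

load_script = (
    "create table all_types as select * "
    "exclude(small_enum, medium_enum, large_enum) from test_all_types();"
)
fuzz_call = (
    f"call sqlsmith(max_queries={MAX_QUERIES}, seed={SEED}, verbose_output=1, "
    "log='sqlsmith.log', complete_log='sqlsmith.complete.log');"
)

# Same invocation style as run_shell_command() in run_fuzzer.py:
# batch mode, no init file, SQL fed through stdin.
res = subprocess.run(
    [SHELL, "--batch", "-init", "/dev/null"],
    input=(load_script + "\n" + fuzz_call).encode("utf8"),
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)

if res.returncode != 0:
    # A non-zero exit is what triggers the reproduce/reduce/file-issue path in
    # run_fuzzer.py (via reduce_sql.py and fuzzer_helper.py), starting from the
    # queries recorded in the log files.
    print(res.stderr.decode("utf8", "ignore"))
```

Running the generated SQL through a separate shell process, rather than an in-process connection, is what lets the scripts survive hard crashes and still recover the last logged query from `sqlsmith.log` for reduction.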