From c6cfcd41fe25ad85420a5f02bbbc958dce3e0006 Mon Sep 17 00:00:00 2001 From: qhga Date: Tue, 8 Feb 2022 18:43:14 +0100 Subject: [PATCH] feat: data generation --- melli/generate_data.py | 302 +++++++++++++++++++++++++++++++++++++++++ melli/table_spec.json | 61 +++++++++ 2 files changed, 363 insertions(+) create mode 100755 melli/generate_data.py create mode 100644 melli/table_spec.json diff --git a/melli/generate_data.py b/melli/generate_data.py new file mode 100755 index 0000000..1a181f2 --- /dev/null +++ b/melli/generate_data.py @@ -0,0 +1,302 @@ +#!/bin/python3 +# Author: phga +# Date: 2022-02-08 +# Desc: Generate data that can later be used in excel2sql.py. Meant to be one file +import json +from random import randint +from argparse import ArgumentParser +from datetime import datetime as dt, timedelta + +parser = ArgumentParser() +parser.add_argument("-c", "--count", dest="count", type=int, required=False, + help="Number of rows to generate") +parser.add_argument("-i", "--in", dest="in_file", required=False, + help="Json file to read table spec from") +parser.add_argument("-o", "--out", dest="out_file", required=False, + help="File to print sql statements to. If not provided use stdout") +parser.add_argument("-s", "--show-spec", dest="spec", action="store_true", required=False, + help="Show information about the table spec file") +try: args = parser.parse_args() +except: exit(1) + +if not args.spec and (not args.count or not args.in_file): + parser.print_help() + exit(1) + +class ColumnFactory: + def create(c_name: str, c_type: str, data_generation: str, + data_source: list[str]): + if c_type == "int": + return IntColumn(c_name, c_type, data_generation, data_source) + elif c_type == "str": + return StrColumn(c_name, c_type, data_generation, data_source) + elif c_type == "date": + return DateColumn(c_name, c_type, data_generation, data_source) + elif c_type == "time": + return TimeColumn(c_name, c_type, data_generation, data_source) + + + def get_types() -> list[str]: + return ["int", "str", "date", "time"] + + + def describe(c_type: str): + if c_type == "int": + spec_dg = ["random", "loop", "auto_increment"] + spec_ds = '[start, end] -> e.g. [10, 20] (boundaries included)\n' + spec_ds+= 'For now, one has to always provide start AND end, even with AI' + elif c_type == "str": + spec_dg = ["random", "loop"] + spec_ds = '[str, ...] -> e.g. ["Hans", "Berlin", "Tina"]' + elif c_type == "date": + spec_dg = ["random", "loop"] + spec_ds = '[startdate, enddate] -> e.g. ["2020-01-01", "2020-10-28"]' + elif c_type == "time": + spec_dg = ["random", "loop"] + spec_ds = '[starttime, endtime, step] -> ' + spec_ds+= 'e.g. ["10:10:10.001", "21:11:11.002", "20 m"]\n' + spec_ds+= 'step: [1,Inf] {s,m,h} -> e.g. "10 m", "33 s" or "2 h"' + + spec = f'{" " + c_type.upper() + " ":=^45}\n' + spec+= 'col_name: str -> e.g. "Meine_tolle_Spalte"\n' + spec+= f'col_type: {c_type}\n' + spec+= f'data_generation: {", ".join(spec_dg)}\n' + spec+= f'data_source: {spec_ds}\n\n' + + return spec + +#### Print intput file specification -> Yes multiple files would be great + +if args.spec: + si = 'Information about the table spec file:\n' + si+= 'Tables can be described via a json file. ' + si+= 'A minimal example with 2 columns is show here:\n' + si+= ''' +[ + { + "col_name": "Checkout_Time", + "col_type": "time", + "data_generation": "loop", + "data_source": [ + "10:10:10.001", + "21:11:11.002", + "20 s" + ] + }, + { + "col_name": "Order_Amount", + "col_type": "int", + "data_generation": "random", + "data_source": [ + 10, + 20 + ] + } +] + +''' + si += "Description for available column types:\n\n" + for ct in ColumnFactory.get_types(): + si += ColumnFactory.describe(ct) + + print(si) + exit(0) + +#### Classes + +class Column: + def __init__(self, c_name: str, c_type: str, data_generation: str, + data_source: list[str]) -> None: + self.c_name = c_name + self.c_type = c_type + self.data_generation = data_generation + self.data_source = data_source + + + def __repr__(self): + return f'{self.c_name}[{self.c_type}, {self.data_generation}]: {self.data_source}' + + + def generate(self) -> list[str]: + raise NotImplementedError() + + +class IntColumn(Column): + def __init__(self, c_name: str, c_type: str, data_generation: str, + data_source: list[str]): + super().__init__(c_name, c_type, data_generation, data_source) + # TODO: autogenerate end if not provided for some types + if not type(data_source) is list or len(data_source) < 2: + raise TypeError + self.last_int = self.data_source[0] + + + def generate(self) -> int: + res = 0 + if self.data_generation == "random": + res = randint(self.data_source[0], self.data_source[1]) + + elif self.data_generation == "loop": + res = self.last_int + # [start, end] inclusive + self.last_int = (self.last_int + 1) % (self.data_source[1] + 1) + if self.last_int == 0: + self.last_int = self.data_source[0] + + elif self.data_generation == "auto_increment": + res = self.last_int + self.last_int += 1 + + else: + raise ValueError() + + return res + + +class StrColumn(Column): + def __init__(self, c_name: str, c_type: str, data_generation: str, + data_source: list[str]): + super().__init__(c_name, c_type, data_generation, data_source) + if not type(data_source) is list or len(data_source) < 1: + raise TypeError + # MAYB: random loop -> First index is random + self.last_index = 0 + + + def generate(self) -> str: + res = "NULL" + l = len(self.data_source) + if self.data_generation == "random": + res = self.data_source[randint(0, l - 1)] + + elif self.data_generation == "loop": + res = self.data_source[self.last_index % l] + self.last_index += 1 + + else: + raise ValueError() + + return res + + +class DateColumn(Column): + def __init__(self, c_name: str, c_type: str, data_generation: str, + data_source: list[str]): + super().__init__(c_name, c_type, data_generation, data_source) + if not type(data_source) is list or len(data_source) < 2: + raise TypeError + # TODO: mayb add the ability to process multiple date ranges + self.last_index = 0 + self.date_format = "%Y-%m-%d" + self.start_date = dt.strptime(self.data_source[0], self.date_format) + self.end_date = dt.strptime(self.data_source[1], self.date_format) + self.date_diff = (self.end_date - self.start_date).days + + + def generate(self) -> str: + res = "Not a date" + if self.data_generation == "random": + delta = randint(0, self.date_diff) + res = self.start_date + timedelta(delta) + + elif self.data_generation == "loop": + delta = self.last_index + res = self.start_date + timedelta(delta) + self.last_index = (self.last_index + 1) % (self.date_diff + 1) + + else: + raise ValueError() + + return res.strftime(self.date_format) + + +class TimeColumn(Column): + def __init__(self, c_name: str, c_type: str, data_generation: str, + data_source: list[str]): + super().__init__(c_name, c_type, data_generation, data_source) + if not type(data_source) is list or len(data_source) < 2: + raise TypeError + # TODO: mayb add the ability to process multiple date ranges + self.last_index = 0 + # SAP HANA: 11:00:00.001 + self.time_format = "%H:%M:%S.%f" + self.start_time = dt.strptime(self.data_source[0], self.time_format) + self.end_time = dt.strptime(self.data_source[1], self.time_format) + + # 5 m, 10 s + self.time_step = int(self.data_source[2].split(" ")[0]) + self.time_mult = self.data_source[2].split(" ")[1] + if self.time_mult == "m": self.time_mult = 60 + elif self.time_mult == "h": self.time_mult = 60 * 60 + else: self.time_mult = 1 + + self.time_diff = (self.end_time - self.start_time).seconds // self.time_mult + + + def generate(self) -> str: + res = "Not a time" + if self.data_generation == "random": + delta = randint(0, self.time_diff) + res = self.start_time + timedelta(seconds=delta) + + elif self.data_generation == "loop": + delta = self.last_index + res = self.start_time + timedelta(seconds=delta * self.time_mult) + # TODO: mayb reset the timer to the start time instead of beginning + # a new cycle, but this is also kinda cool -> option? + self.last_index = (self.last_index + self.time_step) % (self.time_diff + 1) + + else: + raise ValueError() + + return res.strftime(self.time_format) + pass + +#### Logic + +# Read specification from file +with open(args.in_file, "r") as in_file: + try: + table_spec = json.load(in_file) + except: + print("Could not read json table spec.") + print("Are you sure you provided the correct file?") + exit(69) + +cols: list[Column] = [] +for ts in table_spec: + cols.append(ColumnFactory.create(ts["col_name"], ts["col_type"], + ts["data_generation"], ts["data_source"])) + +rows = [] +rows.append([]) # CSV headers +for c in cols: + rows[0].append(c.c_name) + +for i in range(1, args.count + 1): + for j in range(len(cols)): + if j == 0: rows.append([]) + # Convert to str -> csv output and usage of join + rows[i].append(str(cols[j].generate())) + + +if args.out_file: + with open(args.out_file, "w") as of: + for r in rows: + of.write(f'{",".join(r)}\n') + +else: + for r in rows: + print(",".join(r)) + + + +#### Tests +# ic = IntColumn("test", "int", "loop", [10, 15]) +# sc = StrColumn("test", "str", "loop", ["Hans", "Wurst", "mag", "Züge"]) +# dc = DateColumn("test", "date", "random", ["2020-01-01", "2020-01-10"]) +# tc = TimeColumn("test", "time", "loop", ["10:10:10.001", "11:11:11.002", "5 m"]) +# print(ic) +# print(sc) +# print(dc) +# print(tc) \ No newline at end of file diff --git a/melli/table_spec.json b/melli/table_spec.json new file mode 100644 index 0000000..4373d1b --- /dev/null +++ b/melli/table_spec.json @@ -0,0 +1,61 @@ +[ + { + "col_name": "ID", + "col_type": "int", + "data_generation": "auto_increment", + "data_source": [ + 10, + 80 + ] + }, + { + "col_name": "Ein_Begriff", + "col_type": "str", + "data_generation": "loop", + "data_source": [ + "Berlin", + "Angela Merkel", + "Olaf Scholz", + "München", + "Protest", + "Corona" + ] + }, + { + "col_name": "Datum", + "col_type": "date", + "data_generation": "random", + "data_source": [ + "2020-01-01", + "2020-02-01" + ] + }, + { + "col_name": "Zeit", + "col_type": "time", + "data_generation": "loop", + "data_source": [ + "10:10:10.001", + "21:11:11.002", + "20 s" + ] + }, + { + "col_name": "Anzahl_von_Etwas", + "col_type": "int", + "data_generation": "random", + "data_source": [ + 10, + 20 + ] + }, + { + "col_name": "Andere_Nummer", + "col_type": "int", + "data_generation": "loop", + "data_source": [ + 10, + 15 + ] + } +]