parent
86637c3615
commit
c6cfcd41fe
@ -0,0 +1,302 @@
|
||||
#!/bin/python3
|
||||
# Author: phga <phga@posteo.de>
|
||||
# Date: 2022-02-08
|
||||
# Desc: Generate data that can later be used in excel2sql.py. Meant to be one file
|
||||
import json
|
||||
from random import randint
|
||||
from argparse import ArgumentParser
|
||||
from datetime import datetime as dt, timedelta
|
||||
|
||||
# Command-line interface. Generating data needs both --count and --in;
# --show-spec only prints the description of the table spec format.
parser = ArgumentParser()
parser.add_argument("-c", "--count", dest="count", type=int, required=False,
                    help="Number of rows to generate")
parser.add_argument("-i", "--in", dest="in_file", required=False,
                    help="Json file to read table spec from")
parser.add_argument("-o", "--out", dest="out_file", required=False,
                    help="File to print sql statements to. If not provided use stdout")
parser.add_argument("-s", "--show-spec", dest="spec", action="store_true", required=False,
                    help="Show information about the table spec file")

try:
    args = parser.parse_args()
except SystemExit as err:
    # argparse signals both "invalid arguments" and "--help was printed" by
    # raising SystemExit. Keep this script's exit status 1 for the error
    # case, but let a successful --help (status 0) stay a success.
    raise SystemExit(1 if err.code else 0) from None

# Without --show-spec, a row count and a spec file are both mandatory.
if not args.spec and (not args.count or not args.in_file):
    parser.print_help()
    raise SystemExit(1)
|
||||
|
||||
class ColumnFactory:
    """Create column generators and describe the supported column types.

    All methods are static; the class only serves as a namespace.
    """

    @staticmethod
    def create(c_name: str, c_type: str, data_generation: str,
               data_source: list[str]):
        """Return a column generator for the given column type.

        Args:
            c_name: Column name.
            c_type: One of the values returned by ``get_types()``.
            data_generation: Generation mode, passed through to the column.
            data_source: Type-specific source values, passed through.

        Raises:
            ValueError: If ``c_type`` is not a supported column type.
        """
        # Reject unknown types up front instead of silently returning None.
        if c_type not in ColumnFactory.get_types():
            raise ValueError(f"Unknown column type: {c_type!r}")

        # Resolved at call time: the column classes are defined after this
        # class in the file.
        column_classes = {
            "int": IntColumn,
            "str": StrColumn,
            "date": DateColumn,
            "time": TimeColumn,
        }
        return column_classes[c_type](c_name, c_type, data_generation, data_source)

    @staticmethod
    def get_types() -> list[str]:
        """Return the names of all supported column types."""
        return ["int", "str", "date", "time"]

    @staticmethod
    def describe(c_type: str):
        """Return a printable description of the spec fields for ``c_type``.

        Raises:
            ValueError: If ``c_type`` is not a supported column type.
        """
        if c_type == "int":
            spec_dg = ["random", "loop", "auto_increment"]
            spec_ds = '[start, end] -> e.g. [10, 20] (boundaries included)\n'
            spec_ds += 'For now, one has to always provide start AND end, even with AI'
        elif c_type == "str":
            spec_dg = ["random", "loop"]
            spec_ds = '[str, ...] -> e.g. ["Hans", "Berlin", "Tina"]'
        elif c_type == "date":
            spec_dg = ["random", "loop"]
            spec_ds = '[startdate, enddate] -> e.g. ["2020-01-01", "2020-10-28"]'
        elif c_type == "time":
            spec_dg = ["random", "loop"]
            spec_ds = '[starttime, endtime, step] -> '
            spec_ds += 'e.g. ["10:10:10.001", "21:11:11.002", "20 m"]\n'
            spec_ds += 'step: [1,Inf] {s,m,h} -> e.g. "10 m", "33 s" or "2 h"'
        else:
            # Previously fell through and failed on the unbound spec_dg below.
            raise ValueError(f"Unknown column type: {c_type!r}")

        # Header line: the upper-cased type name centred in a 45-char rule.
        spec = f'{" " + c_type.upper() + " ":=^45}\n'
        spec += 'col_name: str -> e.g. "Meine_tolle_Spalte"\n'
        spec += f'col_type: {c_type}\n'
        spec += f'data_generation: {", ".join(spec_dg)}\n'
        spec += f'data_source: {spec_ds}\n\n'

        return spec
|
||||
|
||||
#### Print input file specification -> Yes multiple files would be great
|
||||
|
||||
if args.spec:
    # --show-spec: print an intro, a minimal JSON example and one description
    # block per supported column type, then stop without generating data.
    intro = ('Information about the table spec file:\n'
             'Tables can be described via a json file. '
             'A minimal example with 2 columns is show here:\n')
    example = '''
[
    {
        "col_name": "Checkout_Time",
        "col_type": "time",
        "data_generation": "loop",
        "data_source": [
            "10:10:10.001",
            "21:11:11.002",
            "20 s"
        ]
    },
    {
        "col_name": "Order_Amount",
        "col_type": "int",
        "data_generation": "random",
        "data_source": [
            10,
            20
        ]
    }
]

'''
    type_docs = "".join(ColumnFactory.describe(ct)
                        for ct in ColumnFactory.get_types())

    print(intro + example + "Description for available column types:\n\n" + type_docs)
    exit(0)
|
||||
|
||||
#### Classes
|
||||
|
||||
class Column:
    """Base class for one generated table column.

    Holds the column description taken from the table spec; subclasses
    provide the actual value generation in ``generate``.
    """

    def __init__(self, c_name: str, c_type: str, data_generation: str,
                 data_source: list[str]) -> None:
        """Store the column name, type, generation mode and source values."""
        self.c_name, self.c_type = c_name, c_type
        self.data_generation, self.data_source = data_generation, data_source

    def __repr__(self):
        """Return ``name[type, mode]: source`` for debugging output."""
        return "{}[{}, {}]: {}".format(
            self.c_name, self.c_type, self.data_generation, self.data_source)

    def generate(self) -> list[str]:
        """Produce the next value; must be overridden by subclasses."""
        raise NotImplementedError()
|
||||
|
||||
|
||||
class IntColumn(Column):
    """Integer column fed from an inclusive ``[start, end]`` range.

    Supported generation modes:
        random: a random integer between start and end (both included).
        loop: start, start + 1, ..., end, then start again.
        auto_increment: start, start + 1, ... without an upper bound.
    """

    def __init__(self, c_name: str, c_type: str, data_generation: str,
                 data_source: list[int]):
        """Validate ``data_source`` and position the counter on ``start``.

        Raises:
            TypeError: If ``data_source`` is not a list of at least two items.
        """
        super().__init__(c_name, c_type, data_generation, data_source)
        # TODO: autogenerate end if not provided for some types
        if not isinstance(data_source, list) or len(data_source) < 2:
            raise TypeError("int columns need data_source = [start, end]")
        # Next value handed out by the "loop" and "auto_increment" modes.
        self.last_int = self.data_source[0]

    def generate(self) -> int:
        """Return the next integer according to the generation mode.

        Raises:
            ValueError: If the generation mode is not supported.
        """
        start, end = self.data_source[0], self.data_source[1]

        if self.data_generation == "random":
            return randint(start, end)

        if self.data_generation == "loop":
            res = self.last_int
            # Wrap explicitly once `end` has been handed out. The former
            # modulo-based wrap broke for ranges containing negative numbers
            # (e.g. [-2, 2]) and divided by zero when end == -1.
            self.last_int = start if res >= end else res + 1
            return res

        if self.data_generation == "auto_increment":
            res = self.last_int
            self.last_int += 1
            return res

        raise ValueError(f"Unknown data_generation: {self.data_generation!r}")
|
||||
|
||||
|
||||
class StrColumn(Column):
    """String column that picks its values from a fixed list of strings."""

    def __init__(self, c_name: str, c_type: str, data_generation: str,
                 data_source: list[str]):
        """Require a non-empty list as source and start looping at index 0."""
        super().__init__(c_name, c_type, data_generation, data_source)
        is_plain_list = type(data_source) is list
        if not (is_plain_list and len(data_source) >= 1):
            raise TypeError
        # MAYB: random loop -> First index is random
        self.last_index = 0

    def generate(self) -> str:
        """Return a random entry or the next entry of the endless loop."""
        mode = self.data_generation
        count = len(self.data_source)

        if mode == "random":
            return self.data_source[randint(0, count - 1)]

        if mode != "loop":
            raise ValueError()

        # The running index is never reset; the modulo maps it onto the list.
        position = self.last_index % count
        self.last_index += 1
        return self.data_source[position]
|
||||
|
||||
|
||||
class DateColumn(Column):
    """Date column producing ``YYYY-MM-DD`` strings between two dates."""

    def __init__(self, c_name: str, c_type: str, data_generation: str,
                 data_source: list[str]):
        """Parse the start and end date and cache the day span between them."""
        super().__init__(c_name, c_type, data_generation, data_source)
        if type(data_source) is not list or len(data_source) < 2:
            raise TypeError
        # TODO: mayb add the ability to process multiple date ranges
        self.last_index = 0
        self.date_format = "%Y-%m-%d"
        first, last = (dt.strptime(raw, self.date_format)
                       for raw in self.data_source[:2])
        self.start_date = first
        self.end_date = last
        # Whole days from start to end; offsets 0..date_diff are valid.
        self.date_diff = (last - first).days

    def generate(self) -> str:
        """Return the next date as a string formatted with ``date_format``."""
        if self.data_generation == "random":
            offset_days = randint(0, self.date_diff)
        elif self.data_generation == "loop":
            offset_days = self.last_index
            # Step one day ahead and wrap after the end date was produced.
            self.last_index = (offset_days + 1) % (self.date_diff + 1)
        else:
            raise ValueError()

        current = self.start_date + timedelta(offset_days)
        return current.strftime(self.date_format)
|
||||
|
||||
|
||||
class TimeColumn(Column):
    """Time column producing ``HH:MM:SS.ffffff`` strings between two times.

    ``data_source`` is ``[start, end, step]`` where step is a count and a
    unit, e.g. ``"20 s"``, ``"5 m"`` or ``"2 h"``. Offsets are counted in
    that unit.
    """

    # Seconds per step unit; any other unit is treated as seconds.
    _UNIT_SECONDS = {"m": 60, "h": 60 * 60}

    def __init__(self, c_name: str, c_type: str, data_generation: str,
                 data_source: list[str]):
        """Parse start time, end time and step from ``data_source``.

        Raises:
            TypeError: If ``data_source`` is not a list of at least 3 items.
            ValueError: If a time or the step cannot be parsed.
        """
        super().__init__(c_name, c_type, data_generation, data_source)
        # All three entries are read below, so two entries are not enough.
        if not isinstance(data_source, list) or len(data_source) < 3:
            raise TypeError("time columns need data_source = [start, end, step]")
        # TODO: mayb add the ability to process multiple date ranges
        self.last_index = 0
        # SAP HANA: 11:00:00.001
        self.time_format = "%H:%M:%S.%f"
        self.start_time = dt.strptime(self.data_source[0], self.time_format)
        self.end_time = dt.strptime(self.data_source[1], self.time_format)

        # 5 m, 10 s
        step_parts = self.data_source[2].split()
        if len(step_parts) != 2:
            raise ValueError('step must look like "20 s", "5 m" or "2 h"')
        self.time_step = int(step_parts[0])
        self.time_mult = self._UNIT_SECONDS.get(step_parts[1], 1)

        # Length of the range expressed in step units (not in seconds).
        self.time_diff = (self.end_time - self.start_time).seconds // self.time_mult

    def generate(self) -> str:
        """Return the next time as a string formatted with ``time_format``.

        Raises:
            ValueError: If the generation mode is not supported.
        """
        if self.data_generation == "random":
            delta = randint(0, self.time_diff)
        elif self.data_generation == "loop":
            delta = self.last_index
            # TODO: mayb reset the timer to the start time instead of beginning
            #       a new cycle, but this is also kinda cool -> option?
            self.last_index = (self.last_index + self.time_step) % (self.time_diff + 1)
        else:
            raise ValueError(f"Unknown data_generation: {self.data_generation!r}")

        # delta is counted in step units in both modes, so convert it to
        # seconds here. Random mode used to add it unscaled, which confined
        # "m"/"h" columns to the very beginning of the range.
        res = self.start_time + timedelta(seconds=delta * self.time_mult)
        return res.strftime(self.time_format)
|
||||
|
||||
#### Logic
|
||||
|
||||
# Read specification from file
|
||||
# An unreadable file and invalid JSON are reported the same way: both mean
# the wrong file was passed. JSON text is UTF-8, so decode it as such
# instead of relying on the platform default encoding.
try:
    with open(args.in_file, "r", encoding="utf-8") as in_file:
        table_spec = json.load(in_file)
except (OSError, ValueError):
    # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
    print("Could not read json table spec.")
    print("Are you sure you provided the correct file?")
    raise SystemExit(69)

# One generator object per column described in the spec.
cols: list[Column] = [
    ColumnFactory.create(ts["col_name"], ts["col_type"],
                         ts["data_generation"], ts["data_source"])
    for ts in table_spec
]

rows = [[c.c_name for c in cols]]  # CSV headers
if cols:
    for _ in range(args.count):
        # Convert to str -> csv output and usage of join
        rows.append([str(c.generate()) for c in cols])

lines = [",".join(r) for r in rows]

if args.out_file:
    with open(args.out_file, "w") as of:
        of.writelines(f"{line}\n" for line in lines)
else:
    for line in lines:
        print(line)
|
||||
|
||||
|
||||
|
||||
#### Tests
|
||||
# ic = IntColumn("test", "int", "loop", [10, 15])
|
||||
# sc = StrColumn("test", "str", "loop", ["Hans", "Wurst", "mag", "Züge"])
|
||||
# dc = DateColumn("test", "date", "random", ["2020-01-01", "2020-01-10"])
|
||||
# tc = TimeColumn("test", "time", "loop", ["10:10:10.001", "11:11:11.002", "5 m"])
|
||||
# print(ic)
|
||||
# print(sc)
|
||||
# print(dc)
|
||||
# print(tc)
|
@ -0,0 +1,61 @@
|
||||
[
|
||||
{
|
||||
"col_name": "ID",
|
||||
"col_type": "int",
|
||||
"data_generation": "auto_increment",
|
||||
"data_source": [
|
||||
10,
|
||||
80
|
||||
]
|
||||
},
|
||||
{
|
||||
"col_name": "Ein_Begriff",
|
||||
"col_type": "str",
|
||||
"data_generation": "loop",
|
||||
"data_source": [
|
||||
"Berlin",
|
||||
"Angela Merkel",
|
||||
"Olaf Scholz",
|
||||
"München",
|
||||
"Protest",
|
||||
"Corona"
|
||||
]
|
||||
},
|
||||
{
|
||||
"col_name": "Datum",
|
||||
"col_type": "date",
|
||||
"data_generation": "random",
|
||||
"data_source": [
|
||||
"2020-01-01",
|
||||
"2020-02-01"
|
||||
]
|
||||
},
|
||||
{
|
||||
"col_name": "Zeit",
|
||||
"col_type": "time",
|
||||
"data_generation": "loop",
|
||||
"data_source": [
|
||||
"10:10:10.001",
|
||||
"21:11:11.002",
|
||||
"20 s"
|
||||
]
|
||||
},
|
||||
{
|
||||
"col_name": "Anzahl_von_Etwas",
|
||||
"col_type": "int",
|
||||
"data_generation": "random",
|
||||
"data_source": [
|
||||
10,
|
||||
20
|
||||
]
|
||||
},
|
||||
{
|
||||
"col_name": "Andere_Nummer",
|
||||
"col_type": "int",
|
||||
"data_generation": "loop",
|
||||
"data_source": [
|
||||
10,
|
||||
15
|
||||
]
|
||||
}
|
||||
]
|
Loading…
Reference in new issue