feat: data generation

master
qhga 3 years ago
parent 86637c3615
commit c6cfcd41fe
Signed by: phga
GPG Key ID: 5249548AA705F019

@ -0,0 +1,302 @@
#!/bin/python3
# Author: phga <phga@posteo.de>
# Date: 2022-02-08
# Desc: Generate data that can later be used in excel2sql.py. Meant to be one file
import json
from random import randint
from argparse import ArgumentParser
from datetime import datetime as dt, timedelta
# Command line interface. Valid invocations are either `--show-spec` alone or
# `--count` together with `--in` (optionally `--out`).
parser = ArgumentParser()
parser.add_argument("-c", "--count", dest="count", type=int, required=False,
                    help="Number of rows to generate")
parser.add_argument("-i", "--in", dest="in_file", required=False,
                    help="Json file to read table spec from")
parser.add_argument("-o", "--out", dest="out_file", required=False,
                    help="File to write the generated CSV rows to. "
                         "If not provided use stdout")
parser.add_argument("-s", "--show-spec", dest="spec", action="store_true",
                    required=False,
                    help="Show information about the table spec file")
try:
    args = parser.parse_args()
except SystemExit as err:
    # argparse reports usage errors AND -h/--help through SystemExit.
    # Usage errors keep this script's exit status 1; a successful --help
    # (code 0) is no longer turned into a failure.
    raise SystemExit(1 if err.code else 0) from None
# Without --show-spec both a (non-zero) row count and an input file are needed.
if not args.spec and (not args.count or not args.in_file):
    parser.print_help()
    raise SystemExit(1)
class ColumnFactory:
    """Create column generators by type name and describe the spec format.

    All methods are static; the class is only used as a namespace and is
    called as ``ColumnFactory.create(...)`` etc.
    """

    @staticmethod
    def create(c_name: str, c_type: str, data_generation: str,
               data_source: list) -> "Column":
        """Return the column object matching ``c_type``.

        Args:
            c_name: Column name (used as CSV header by the caller).
            c_type: One of the names returned by :meth:`get_types`.
            data_generation: Generation mode, passed through to the column.
            data_source: Type specific source list, passed through as is.

        Raises:
            ValueError: If ``c_type`` is not a supported column type
                (previously ``None`` was returned silently).
        """
        if c_type not in ColumnFactory.get_types():
            raise ValueError(f"Unknown column type: {c_type!r}")
        # Resolved lazily: the column classes are defined further down in
        # the file, after this class.
        column_classes = {
            "int": IntColumn,
            "str": StrColumn,
            "date": DateColumn,
            "time": TimeColumn,
        }
        return column_classes[c_type](c_name, c_type, data_generation,
                                      data_source)

    @staticmethod
    def get_types() -> list[str]:
        """Return the names of all supported column types."""
        return ["int", "str", "date", "time"]

    @staticmethod
    def describe(c_type: str) -> str:
        """Return a human readable description of the spec for ``c_type``.

        The text consists of a centered ``=`` banner with the upper-cased
        type name followed by one line per spec key.

        Raises:
            ValueError: If ``c_type`` is not a supported column type.
        """
        # type name -> (allowed data_generation values, data_source help)
        specs = {
            "int": (
                ["random", "loop", "auto_increment"],
                '[start, end] -> e.g. [10, 20] (boundaries included)\n'
                'For now, one has to always provide start AND end, even with AI',
            ),
            "str": (
                ["random", "loop"],
                '[str, ...] -> e.g. ["Hans", "Berlin", "Tina"]',
            ),
            "date": (
                ["random", "loop"],
                '[startdate, enddate] -> e.g. ["2020-01-01", "2020-10-28"]',
            ),
            "time": (
                ["random", "loop"],
                '[starttime, endtime, step] -> '
                'e.g. ["10:10:10.001", "21:11:11.002", "20 m"]\n'
                'step: [1,Inf] {s,m,h} -> e.g. "10 m", "33 s" or "2 h"',
            ),
        }
        try:
            spec_dg, spec_ds = specs[c_type]
        except KeyError:
            raise ValueError(f"Unknown column type: {c_type!r}") from None
        return "".join([
            f'{" " + c_type.upper() + " ":=^45}\n',
            'col_name: str -> e.g. "Meine_tolle_Spalte"\n',
            f'col_type: {c_type}\n',
            f'data_generation: {", ".join(spec_dg)}\n',
            f'data_source: {spec_ds}\n\n',
        ])
#### Print input file specification -> Yes multiple files would be great
# With --show-spec the script only prints the spec documentation and exits
# with status 0; nothing is generated.
if args.spec:
    spec_example = '''
[
    {
        "col_name": "Checkout_Time",
        "col_type": "time",
        "data_generation": "loop",
        "data_source": [
            "10:10:10.001",
            "21:11:11.002",
            "20 s"
        ]
    },
    {
        "col_name": "Order_Amount",
        "col_type": "int",
        "data_generation": "random",
        "data_source": [
            10,
            20
        ]
    }
]
'''
    si_parts = [
        'Information about the table spec file:\n',
        'Tables can be described via a json file. ',
        'A minimal example with 2 columns is shown here:\n',
        spec_example,
        "Description for available column types:\n\n",
    ]
    # One description section per supported column type.
    si_parts.extend(ColumnFactory.describe(ct)
                    for ct in ColumnFactory.get_types())
    print("".join(si_parts))
    raise SystemExit(0)
#### Classes
class Column:
    """Base class for one generated column.

    Stores the four values of a column spec entry unchanged; subclasses
    implement :meth:`generate` to produce one cell value per call.
    """

    def __init__(self, c_name: str, c_type: str, data_generation: str,
                 data_source: list) -> None:
        """Store the spec values.

        Args:
            c_name: Column name.
            c_type: Column type name.
            data_generation: Name of the generation mode.
            data_source: Type specific list the values are derived from
                (its elements are not necessarily strings).
        """
        self.c_name = c_name
        self.c_type = c_type
        self.data_generation = data_generation
        self.data_source = data_source

    def __repr__(self) -> str:
        return f'{self.c_name}[{self.c_type}, {self.data_generation}]: {self.data_source}'

    def generate(self) -> "int | str":
        """Return the next value of this column (one value per call).

        Raises:
            NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement generate()")
class IntColumn(Column):
    """Integer column driven by ``data_source = [start, end]``.

    Generation modes:
        random:          uniform integer in ``[start, end]`` (inclusive).
        loop:            start, start+1, ..., end, start, ...
        auto_increment:  start, start+1, ... without upper bound
                         (``end`` is not consulted).
    """

    def __init__(self, c_name: str, c_type: str, data_generation: str,
                 data_source: list):
        """Validate ``data_source`` and initialise the running counter.

        Raises:
            TypeError: If ``data_source`` is not a list with at least the
                two entries ``[start, end]``.
        """
        super().__init__(c_name, c_type, data_generation, data_source)
        # TODO: autogenerate end if not provided for some types
        if not isinstance(data_source, list) or len(data_source) < 2:
            raise TypeError(
                f"int column {c_name!r} needs data_source [start, end]")
        self.last_int = self.data_source[0]

    def generate(self) -> int:
        """Return the next integer according to ``data_generation``.

        Raises:
            ValueError: If ``data_generation`` is not a supported mode.
        """
        start, end = self.data_source[0], self.data_source[1]
        if self.data_generation == "random":
            return randint(start, end)
        if self.data_generation == "loop":
            res = self.last_int
            # [start, end] inclusive. Wrap by comparison instead of modulo:
            # `% (end + 1)` produced wrong values for negative or
            # zero-crossing ranges and divided by zero for end == -1.
            self.last_int = start if res >= end else res + 1
            return res
        if self.data_generation == "auto_increment":
            res = self.last_int
            self.last_int += 1
            return res
        raise ValueError(
            f"Unsupported data_generation for int: {self.data_generation!r}")
class StrColumn(Column):
    """String column that picks its values from the ``data_source`` list.

    ``random`` returns a uniformly chosen entry, ``loop`` walks through the
    entries in order and starts over after the last one.
    """

    def __init__(self, c_name: str, c_type: str, data_generation: str,
                 data_source: list[str]):
        """Store the spec; raise ``TypeError`` unless ``data_source`` is a
        non-empty list."""
        super().__init__(c_name, c_type, data_generation, data_source)
        if type(data_source) is not list or not data_source:
            raise TypeError
        # MAYB: random loop -> First index is random
        self.last_index = 0

    def generate(self) -> str:
        """Return the next string; raise ``ValueError`` for an unknown mode."""
        mode = self.data_generation
        count = len(self.data_source)
        if mode == "random":
            return self.data_source[randint(0, count - 1)]
        if mode == "loop":
            # last_index only ever grows; the modulo maps it onto the list.
            picked = self.data_source[self.last_index % count]
            self.last_index += 1
            return picked
        raise ValueError()
class DateColumn(Column):
    """Date column producing ``YYYY-MM-DD`` strings.

    ``data_source`` is ``[startdate, enddate]``. ``random`` returns a date
    a random number of days (0..range) after the start, ``loop`` returns
    consecutive days and wraps back to the start after the end date.
    """

    def __init__(self, c_name: str, c_type: str, data_generation: str,
                 data_source: list[str]):
        """Parse both boundary dates; raise ``TypeError`` unless
        ``data_source`` is a list with at least two entries."""
        super().__init__(c_name, c_type, data_generation, data_source)
        if type(data_source) is not list or len(data_source) < 2:
            raise TypeError
        # TODO: mayb add the ability to process multiple date ranges
        self.last_index = 0
        self.date_format = "%Y-%m-%d"
        first, last = self.data_source[0], self.data_source[1]
        self.start_date = dt.strptime(first, self.date_format)
        self.end_date = dt.strptime(last, self.date_format)
        # Whole days between start and end.
        self.date_diff = (self.end_date - self.start_date).days

    def generate(self) -> str:
        """Return the next date string; raise ``ValueError`` for an unknown
        mode."""
        mode = self.data_generation
        if mode == "random":
            day = self.start_date + timedelta(days=randint(0, self.date_diff))
        elif mode == "loop":
            day = self.start_date + timedelta(days=self.last_index)
            # Offsets run 0..date_diff, so the end date itself is included.
            self.last_index = (self.last_index + 1) % (self.date_diff + 1)
        else:
            raise ValueError()
        return day.strftime(self.date_format)
class TimeColumn(Column):
    """Time-of-day column producing ``HH:MM:SS.ffffff`` strings.

    ``data_source`` is ``[starttime, endtime, step]`` where ``step`` looks
    like ``"20 s"``, ``"5 m"`` or ``"2 h"``. Offsets are counted in whole
    units (seconds, minutes or hours) from the start time:

        random: a random number of units in ``[0, time_diff]``; the step
                amount is not used in this mode.
        loop:   advances by ``time_step`` units per call and wraps modulo
                ``time_diff + 1``.
    """

    def __init__(self, c_name: str, c_type: str, data_generation: str,
                 data_source: list[str]):
        """Parse boundaries and step.

        Raises:
            TypeError: If ``data_source`` is not a list with at least the
                three entries ``[starttime, endtime, step]``.
            ValueError: If a time or the step cannot be parsed.
        """
        super().__init__(c_name, c_type, data_generation, data_source)
        # The step at index 2 is read below, so three entries are required
        # (the previous check for two let a missing step fail later with
        # IndexError).
        if not isinstance(data_source, list) or len(data_source) < 3:
            raise TypeError(
                f"time column {c_name!r} needs data_source "
                "[starttime, endtime, step]")
        # TODO: mayb add the ability to process multiple date ranges
        self.last_index = 0
        # SAP HANA: 11:00:00.001
        self.time_format = "%H:%M:%S.%f"
        self.start_time = dt.strptime(self.data_source[0], self.time_format)
        self.end_time = dt.strptime(self.data_source[1], self.time_format)
        # 5 m, 10 s
        step_parts = self.data_source[2].split()
        if len(step_parts) < 2:
            raise ValueError(
                f"step must look like '20 s', got {self.data_source[2]!r}")
        self.time_step = int(step_parts[0])
        # Seconds per unit; anything other than "m"/"h" counts as seconds.
        self.time_mult = {"m": 60, "h": 60 * 60}.get(step_parts[1], 1)
        # Number of whole units between start and end.
        self.time_diff = (self.end_time - self.start_time).seconds // self.time_mult

    def generate(self) -> str:
        """Return the next time string.

        Raises:
            ValueError: If ``data_generation`` is not a supported mode.
        """
        if self.data_generation == "random":
            units = randint(0, self.time_diff)
            # time_diff counts units, so convert back to seconds; without
            # the factor "m"/"h" specs only covered a fraction of the range.
            res = self.start_time + timedelta(seconds=units * self.time_mult)
        elif self.data_generation == "loop":
            units = self.last_index
            res = self.start_time + timedelta(seconds=units * self.time_mult)
            # TODO: mayb reset the timer to the start time instead of beginning
            # a new cycle, but this is also kinda cool -> option?
            self.last_index = (self.last_index + self.time_step) % (self.time_diff + 1)
        else:
            raise ValueError(
                f"Unsupported data_generation for time: {self.data_generation!r}")
        return res.strftime(self.time_format)
#### Logic
# Read specification from file (JSON text is UTF-8, independent of the locale)
with open(args.in_file, "r", encoding="utf-8") as in_file:
    try:
        table_spec = json.load(in_file)
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        print("Could not read json table spec.")
        print("Are you sure you provided the correct file?")
        raise SystemExit(69)

# Build one column generator per spec entry.
cols: list[Column] = []
for ts in table_spec:
    try:
        col = ColumnFactory.create(ts["col_name"], ts["col_type"],
                                   ts["data_generation"], ts["data_source"])
    except (KeyError, TypeError, ValueError, IndexError) as err:
        # Missing keys, wrongly shaped data_source or unparsable values.
        raise SystemExit(f"Invalid column spec {ts!r}: {err!r}") from None
    if col is None:
        # The factory had no class for this col_type.
        raise SystemExit(f"Unknown col_type in column spec {ts!r}")
    cols.append(col)

# First row holds the CSV headers, followed by args.count generated rows.
rows = [[c.c_name for c in cols]]
if cols:
    # Convert to str -> csv output and usage of join
    rows.extend([str(c.generate()) for c in cols]
                for _ in range(args.count))

# NOTE: values are joined with "," without quoting, so source strings that
# contain commas or newlines end up as extra fields/lines.
lines = [",".join(r) for r in rows]
if args.out_file:
    with open(args.out_file, "w") as of:
        of.writelines(f"{line}\n" for line in lines)
else:
    for line in lines:
        print(line)
#### Tests
# ic = IntColumn("test", "int", "loop", [10, 15])
# sc = StrColumn("test", "str", "loop", ["Hans", "Wurst", "mag", "Züge"])
# dc = DateColumn("test", "date", "random", ["2020-01-01", "2020-01-10"])
# tc = TimeColumn("test", "time", "loop", ["10:10:10.001", "11:11:11.002", "5 m"])
# print(ic)
# print(sc)
# print(dc)
# print(tc)

@ -0,0 +1,61 @@
[
{
"col_name": "ID",
"col_type": "int",
"data_generation": "auto_increment",
"data_source": [
10,
80
]
},
{
"col_name": "Ein_Begriff",
"col_type": "str",
"data_generation": "loop",
"data_source": [
"Berlin",
"Angela Merkel",
"Olaf Scholz",
"München",
"Protest",
"Corona"
]
},
{
"col_name": "Datum",
"col_type": "date",
"data_generation": "random",
"data_source": [
"2020-01-01",
"2020-02-01"
]
},
{
"col_name": "Zeit",
"col_type": "time",
"data_generation": "loop",
"data_source": [
"10:10:10.001",
"21:11:11.002",
"20 s"
]
},
{
"col_name": "Anzahl_von_Etwas",
"col_type": "int",
"data_generation": "random",
"data_source": [
10,
20
]
},
{
"col_name": "Andere_Nummer",
"col_type": "int",
"data_generation": "loop",
"data_source": [
10,
15
]
}
]
Loading…
Cancel
Save