feat: data generation

2022-02-08 18:43:14 +01:00 · 2022-02-08 18:43:14 +01:00 · c6cfcd41fe
commit c6cfcd41fe
parent 86637c3615
2 changed files with 363 additions and 0 deletions
--- a/melli/generate_data.py
+++ b/melli/generate_data.py
@ -0,0 +1,302 @@
 #!/bin/python3
 # Author: phga <phga@posteo.de>
 # Date: 2022-02-08
 # Desc: Generate data that can later be used in excel2sql.py. Meant to be one file
 import csv
 import json
 import sys
 from argparse import ArgumentParser
 from datetime import datetime as dt, timedelta
 from random import randint
 # Command line interface: number of rows, table spec file, optional output
 # file and a flag that only prints the documentation of the spec format.
 parser = ArgumentParser()
 parser.add_argument("-c", "--count", dest="count", type=int, required=False,
                    help="Number of rows to generate")
 parser.add_argument("-i", "--in", dest="in_file", required=False,
                    help="Json file to read table spec from")
 parser.add_argument("-o", "--out", dest="out_file", required=False,
                    help="File to write the generated CSV rows to. If not provided use stdout")
 parser.add_argument("-s", "--show-spec", dest="spec", action="store_true", required=False,
                    help="Show information about the table spec file")
 try:
    args = parser.parse_args()
 except SystemExit as err:
    # argparse leaves via SystemExit: status 0 after -h/--help, non-zero after
    # a usage error. Keep this script's status 1 for errors, but do not report
    # a successful --help as a failure.
    sys.exit(0 if err.code in (0, None) else 1)
 # Unless only the spec documentation was requested, both a row count and an
 # input file are mandatory. NOTE: a count of 0 is treated like a missing count.
 if not args.spec and (not args.count or not args.in_file):
    parser.print_help()
    sys.exit(1)
 class ColumnFactory:
    """Builds column generators from table-spec entries and documents the spec.

    The class is only a namespace: every method is static and can be called
    on the class itself or on an instance.
    """

    @staticmethod
    def create(c_name: str, c_type: str, data_generation: str,
               data_source: list[str]):
        """Return the column generator that matches ``c_type``.

        Args:
            c_name: Column name, used as the header of the generated column.
            c_type: One of the names returned by ``get_types()``.
            data_generation: Generation strategy, forwarded to the column.
            data_source: Type specific source description, forwarded as is.

        Raises:
            ValueError: If ``c_type`` is not a supported column type.
        """
        # The column classes are defined further down in the file, so they are
        # looked up lazily inside each branch instead of via a class-level table.
        if c_type == "int":
            return IntColumn(c_name, c_type, data_generation, data_source)
        elif c_type == "str":
            return StrColumn(c_name, c_type, data_generation, data_source)
        elif c_type == "date":
            return DateColumn(c_name, c_type, data_generation, data_source)
        elif c_type == "time":
            return TimeColumn(c_name, c_type, data_generation, data_source)
        # Fail here with a clear message instead of handing back None.
        raise ValueError(f"Unknown col_type {c_type!r}; expected one of: "
                         f"{', '.join(ColumnFactory.get_types())}")

    @staticmethod
    def get_types() -> list[str]:
        """Return the names of all supported column types."""
        return ["int", "str", "date", "time"]

    @staticmethod
    def describe(c_type: str):
        """Return a human readable description of the spec for ``c_type``.

        Raises:
            ValueError: If ``c_type`` is not a supported column type.
        """
        if c_type == "int":
            spec_dg = ["random", "loop", "auto_increment"]
            spec_ds = '[start, end] -> e.g. [10, 20] (boundaries included)\n'
            spec_ds+= 'For now, one has to always provide start AND end, even with AI'
        elif c_type == "str":
            spec_dg = ["random", "loop"]
            spec_ds = '[str, ...] -> e.g. ["Hans", "Berlin", "Tina"]'
        elif c_type == "date":
            spec_dg = ["random", "loop"]
            spec_ds = '[startdate, enddate] -> e.g. ["2020-01-01", "2020-10-28"]'
        elif c_type == "time":
            spec_dg = ["random", "loop"]
            spec_ds = '[starttime, endtime, step] -> '
            spec_ds+= 'e.g. ["10:10:10.001", "21:11:11.002", "20 m"]\n'
            spec_ds+= 'step: [1,Inf] {s,m,h} -> e.g. "10 m", "33 s" or "2 h"'
        else:
            # Without this branch spec_dg/spec_ds would be unbound below.
            raise ValueError(f"Unknown col_type {c_type!r}; expected one of: "
                             f"{', '.join(ColumnFactory.get_types())}")
        # Heading: the upper-cased type name centred in a 45 character '=' rule.
        spec = f'{" " + c_type.upper() + " ":=^45}\n'
        spec+=  'col_name: str -> e.g. "Meine_tolle_Spalte"\n'
        spec+= f'col_type: {c_type}\n'
        spec+= f'data_generation: {", ".join(spec_dg)}\n'
        spec+= f'data_source: {spec_ds}\n\n'
        return spec
 #### Print input file specification -> Yes, multiple files would be great
 if args.spec:
    # Collect the pieces of the help text and emit them in a single print.
    spec_info = [
        'Information about the table spec file:\n',
        'Tables can be described via a json file. ',
        'A minimal example with 2 columns is show here:\n',
        '''
 [
   {
        "col_name": "Checkout_Time",
        "col_type": "time",
        "data_generation": "loop",
        "data_source": [
            "10:10:10.001",
            "21:11:11.002",
            "20 s"
        ]
    },
    {
        "col_name": "Order_Amount",
        "col_type": "int",
        "data_generation": "random",
        "data_source": [
            10,
            20
        ]
    }
 ]
 ''',
        "Description for available column types:\n\n",
    ]
    # One description section per supported column type, in get_types() order.
    spec_info.extend(ColumnFactory.describe(ct) for ct in ColumnFactory.get_types())
    print("".join(spec_info))
    # Showing the spec is a complete run of its own; nothing is generated.
    exit(0)
 #### Classes
 class Column:
    """Base class for one generated table column.

    Holds the four fields of a table-spec entry. Concrete column types
    derive from it and implement ``generate``.
    """

    def __init__(self, c_name: str, c_type: str, data_generation: str,
                 data_source: list[str]) -> None:
        # The spec entry is stored verbatim; nothing is validated here.
        self.c_name, self.c_type = c_name, c_type
        self.data_generation, self.data_source = data_generation, data_source

    def __repr__(self):
        """Render the column as ``name[type, generation]: source``."""
        return "{}[{}, {}]: {}".format(
            self.c_name, self.c_type, self.data_generation, self.data_source)

    def generate(self) -> list[str]:
        """Produce the next value; the base class provides no implementation."""
        raise NotImplementedError
 class IntColumn(Column):
    """Integer column fed from an inclusive ``[start, end]`` range.

    Supported ``data_generation`` values:
      * ``random``: uniform pick from ``[start, end]``.
      * ``loop``: start, start + 1, ..., end, then start again.
      * ``auto_increment``: start, start + 1, ... without upper bound
        (``end`` is required by the spec but not used).
    """

    def __init__(self, c_name: str, c_type: str, data_generation: str,
                 data_source: list[int]):
        """Store the spec and position the counter at ``start``.

        Raises:
            TypeError: If ``data_source`` is not a list with at least two entries.
        """
        super().__init__(c_name, c_type, data_generation, data_source)
        # TODO: autogenerate end if not provided for some types
        if not isinstance(data_source, list) or len(data_source) < 2:
            raise TypeError("int data_source must be a list [start, end]")
        # Next value handed out by "loop" and "auto_increment".
        self.last_int = self.data_source[0]

    def generate(self) -> int:
        """Return the next integer according to ``data_generation``.

        Raises:
            ValueError: If ``data_generation`` is not a supported strategy.
        """
        start, end = self.data_source[0], self.data_source[1]
        if self.data_generation == "random":
            return randint(start, end)
        if self.data_generation == "loop":
            res = self.last_int
            # [start, end] inclusive. Wrap by comparing against end rather than
            # with a modulo: the modulo form breaks for ranges that contain
            # zero or negative numbers and divides by zero when end == -1.
            self.last_int = start if res >= end else res + 1
            return res
        if self.data_generation == "auto_increment":
            res = self.last_int
            self.last_int += 1
            return res
        raise ValueError(
            f"Unsupported data_generation {self.data_generation!r} for int column")
 class StrColumn(Column):
    """String column whose values come from a fixed list of candidates."""

    def __init__(self, c_name: str, c_type: str, data_generation: str,
                 data_source: list[str]):
        """Store the spec; ``data_source`` must be a non-empty list."""
        super().__init__(c_name, c_type, data_generation, data_source)
        # Anything that is not a list, or an empty list, offers nothing to pick.
        if type(data_source) is not list or not data_source:
            raise TypeError
        # MAYB: random loop -> First index is random
        self.last_index = 0

    def generate(self) -> str:
        """Return a random candidate or the next one in list order.

        Raises ``ValueError`` for any other ``data_generation`` value.
        """
        candidates = self.data_source
        mode = self.data_generation
        if mode == "random":
            return candidates[randint(0, len(candidates) - 1)]
        if mode == "loop":
            # The running counter is reduced modulo the list length on use.
            position = self.last_index % len(candidates)
            self.last_index += 1
            return candidates[position]
        raise ValueError()
 class DateColumn(Column):
    """Date column producing ``%Y-%m-%d`` strings between two dates (inclusive)."""

    def __init__(self, c_name: str, c_type: str, data_generation: str,
                 data_source: list[str]):
        """Parse ``[startdate, enddate]`` and pre-compute the span in days."""
        super().__init__(c_name, c_type, data_generation, data_source)
        if type(data_source) is not list or len(data_source) < 2:
            raise TypeError
        # TODO: mayb add the ability to process multiple date ranges
        self.last_index = 0
        self.date_format = "%Y-%m-%d"
        bounds = [dt.strptime(raw, self.date_format) for raw in self.data_source[:2]]
        self.start_date, self.end_date = bounds
        # Whole days from start to end; offsets 0..date_diff are valid.
        self.date_diff = (self.end_date - self.start_date).days

    def generate(self) -> str:
        """Return a random date of the range or the next one in day order.

        Raises ``ValueError`` for any other ``data_generation`` value.
        """
        mode = self.data_generation
        if mode == "random":
            day = self.start_date + timedelta(days=randint(0, self.date_diff))
        elif mode == "loop":
            day = self.start_date + timedelta(days=self.last_index)
            # Advance by one day and wrap back to the start after the end date.
            self.last_index = (self.last_index + 1) % (self.date_diff + 1)
        else:
            raise ValueError()
        return day.strftime(self.date_format)
 class TimeColumn(Column):
    """Time-of-day column producing ``%H:%M:%S.%f`` strings.

    ``data_source`` is ``[starttime, endtime, step]`` where ``step`` has the
    form ``"<amount> <unit>"`` with unit ``s``, ``m`` or ``h`` (e.g. ``"20 m"``).

    Supported ``data_generation`` values:
      * ``random``: start time plus a random whole number of step units that
        stays within the range (the step amount itself is not applied).
      * ``loop``: start time, then advancing by ``amount`` units per call and
        wrapping around once the range is exceeded.
    """

    def __init__(self, c_name: str, c_type: str, data_generation: str,
                 data_source: list[str]):
        """Parse start, end and step from ``data_source``.

        Raises:
            TypeError: If ``data_source`` is not a list with at least three entries.
        """
        super().__init__(c_name, c_type, data_generation, data_source)
        # Start, end AND step are all read below, so three entries are needed.
        if not isinstance(data_source, list) or len(data_source) < 3:
            raise TypeError(
                "time data_source must be a list [starttime, endtime, step]")
        # TODO: mayb add the ability to process multiple date ranges
        self.last_index = 0
        # SAP HANA: 11:00:00.001
        self.time_format = "%H:%M:%S.%f"
        self.start_time = dt.strptime(self.data_source[0], self.time_format)
        self.end_time = dt.strptime(self.data_source[1], self.time_format)
        # 5 m, 10 s -> whitespace split, so repeated blanks do not hide the unit
        step_parts = self.data_source[2].split()
        self.time_step = int(step_parts[0])
        unit = step_parts[1] if len(step_parts) > 1 else "s"
        # Seconds per step unit; anything but "m"/"h" is treated as seconds.
        self.time_mult = {"m": 60, "h": 60 * 60}.get(unit, 1)
        # Length of the range expressed in whole step units.
        self.time_diff = (self.end_time - self.start_time).seconds // self.time_mult

    def generate(self) -> str:
        """Return the next time value according to ``data_generation``.

        Raises:
            ValueError: If ``data_generation`` is not a supported strategy.
        """
        if self.data_generation == "random":
            delta = randint(0, self.time_diff)
        elif self.data_generation == "loop":
            delta = self.last_index
            # TODO: mayb reset the timer to the start time instead of beginning
            #       a new cycle, but this is also kinda cool -> option?
            self.last_index = (self.last_index + self.time_step) % (self.time_diff + 1)
        else:
            raise ValueError(
                f"Unsupported data_generation {self.data_generation!r} for time column")
        # delta counts step units in both modes, so scale it to seconds here;
        # otherwise "m"/"h" ranges would only ever cover their first seconds.
        res = self.start_time + timedelta(seconds=delta * self.time_mult)
        return res.strftime(self.time_format)
 #### Logic
 # Read specification from file
 # JSON is UTF-8 by specification, so do not depend on the platform default
 # encoding (column names/values may contain non-ASCII characters).
 try:
    with open(args.in_file, "r", encoding="utf-8") as in_file:
        table_spec = json.load(in_file)
 except (OSError, ValueError):
    # OSError: file missing or unreadable. ValueError covers both
    # json.JSONDecodeError and UnicodeDecodeError. Anything else is a genuine
    # bug and is allowed to surface with its traceback.
    print("Could not read json table spec.")
    print("Are you sure you provided the correct file?")
    sys.exit(69)
 # One generator object per column, in spec order.
 cols: list[Column] = [
    ColumnFactory.create(ts["col_name"], ts["col_type"],
                         ts["data_generation"], ts["data_source"])
    for ts in table_spec
 ]
 rows = [[c.c_name for c in cols]] # CSV headers
 # An empty spec yields the (empty) header line only.
 if cols:
    for _ in range(args.count):
        # Convert to str -> csv output
        rows.append([str(c.generate()) for c in cols])
 # csv.writer quotes values that contain commas, quotes or line breaks, which
 # a plain ",".join() would silently turn into broken rows. "\n" keeps the
 # line endings of the previous output.
 if args.out_file:
    with open(args.out_file, "w") as of:
        csv.writer(of, lineterminator="\n").writerows(rows)
 else:
    csv.writer(sys.stdout, lineterminator="\n").writerows(rows)
 #### Tests
 # ic = IntColumn("test", "int", "loop", [10, 15])
 # sc = StrColumn("test", "str", "loop", ["Hans", "Wurst", "mag", "Züge"])
 # dc = DateColumn("test", "date", "random", ["2020-01-01", "2020-01-10"])
 # tc = TimeColumn("test", "time", "loop", ["10:10:10.001", "11:11:11.002", "5 m"])
 # print(ic)
 # print(sc)
 # print(dc)
 # print(tc)
--- a/melli/table_spec.json
+++ b/melli/table_spec.json
@ -0,0 +1,61 @@
 [
  {
    "col_name": "ID",
    "col_type": "int",
    "data_generation": "auto_increment",
    "data_source": [
      10,
      80
    ]
  },
  {
    "col_name": "Ein_Begriff",
    "col_type": "str",
    "data_generation": "loop",
    "data_source": [
      "Berlin",
      "Angela Merkel",
      "Olaf Scholz",
      "München",
      "Protest",
      "Corona"
    ]
  },
  {
    "col_name": "Datum",
    "col_type": "date",
    "data_generation": "random",
    "data_source": [
      "2020-01-01",
      "2020-02-01"
    ]
  },
  {
    "col_name": "Zeit",
    "col_type": "time",
    "data_generation": "loop",
    "data_source": [
      "10:10:10.001",
      "21:11:11.002",
      "20 s"
    ]
  },
  {
    "col_name": "Anzahl_von_Etwas",
    "col_type": "int",
    "data_generation": "random",
    "data_source": [
      10,
      20
    ]
  },
  {
    "col_name": "Andere_Nummer",
    "col_type": "int",
    "data_generation": "loop",
    "data_source": [
      10,
      15
    ]
  }
 ]