Move test data generation outside perf test and remove multiple os size calls

Michal Szczepanski 2019-08-23 02:42:52 +02:00
parent f022b7c0dc
commit e7c0372d59
No known key found for this signature in database
GPG Key ID: 3DCEFDDFCCB1DC69
3 changed files with 57 additions and 39 deletions

@@ -20,7 +20,7 @@ For simplicity data and index files is append only.
 Index file is loaded and stored as dictionary in memory on start.

 #### Dependencies
-Depends on psutil to display pid statistics when running main file.
+Optional psutil to display pid statistics when running main file.

 #### Run
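As context for the "index is loaded as a dictionary" note above, here is a minimal sketch of what that load could look like. It assumes the layout used by `IndexFile` further down (a leading 4-byte int written on file creation, then repeated little-endian uint32 pairs of id and byte offset); the function name is illustrative, not part of the repository.

```python
import struct

INT_SIZE = 4  # every value in the index file is a little-endian unsigned 32-bit int

def load_index(path):
    """Read (id, position) pairs from an index file into a dict."""
    idx = {}
    with open(path, 'rb') as f:
        f.read(INT_SIZE)  # skip the leading header int written on file creation
        pair = f.read(INT_SIZE * 2)
        while len(pair) == INT_SIZE * 2:
            i, position = struct.unpack('<II', pair)
            idx[i] = position
            pair = f.read(INT_SIZE * 2)
    return idx
```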
@@ -29,8 +29,8 @@ I tested it using python3.7 but it can be easily converted to any python version
 By default it:
 - removes 2 files test.db, test.index if those files exist
 - creates 2 files test.db, test.index
-- writes and indexes 100k random strings (100 to 1000 characters each) to the test.db file
-- reads 100k random elements from the file
+- writes and indexes 1 million random strings (100 to 1000 characters each) to the test.db file (around 540MB)
+- reads 1 million random elements from the file (around 8MB)
 - performs read of object at position 2
 - removes object from position 3
 - updates object at position 2
@@ -41,12 +41,13 @@ python dummy_crud_database.py
 #### Output
 ```bash
-write elements 100000 in 70.1026759147644
-read elements 100000 in 3.7399983406066895
-size : 100000
-database fsize : 53.91 MB
-index fsize : 0.76 MB
-pid memory usage : 25.61 MB
+Test elements size 1000000
+write elements in 35.61809206008911s - 28075.61950013945 per second
+read elements in 13.677339792251587s - 73113.63285472477 per second
+size : 1000000
+database fsize : 539.06 MB
+index fsize : 7.63 MB
 ```
-So it looks like it can do 1.4k inserts per second and around 26k reads per second on my computer (write time includes generation of the random data).
+So it looks like it can do around 28k inserts per second and around 73k reads per second on my computer (write time includes the random choice from a 1k-element array).
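The measurement change behind this commit is the usual "generate the payloads before starting the clock" pattern, so the timed loop covers only the writes. A sketch of that pattern, assuming `CrudIndexFile` is importable from `dummy_crud_database` as named in the Run section:

```python
import random
import string
import time

from dummy_crud_database import CrudIndexFile  # assumed module name, per the Run section

rstring = lambda size: ''.join(random.choice(string.ascii_letters) for _ in range(size))

# build a pool of payloads up front so string generation is not counted as write time
test_data = [rstring(random.randrange(100, 1000)) for _ in range(1000)]

with CrudIndexFile() as crud:
    start = time.time()
    for _ in range(1_000_000):
        crud.write(random.choice(test_data))  # only the choice and the write are timed
    elapsed = time.time() - start
    print("writes per second:", 1_000_000 / elapsed)
```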

@@ -33,6 +33,7 @@ class BaseFile:
         pathlib.Path(self.path).touch()
         self.access = access
         self.dbfile = None
+        self.size = 0

     def __enter__(self):
         self.open()
@@ -43,16 +44,14 @@ class BaseFile:
     def open(self):
         self.dbfile = open(self.path, self.access)
-        if self.fsize() == 0:
+        self.size = os.path.getsize(self.path)
+        if self.size == 0:
             Logger.info('write initial data')
             self.dbfile.write(self._write_int(0))
             self.dbfile.seek(0)
+            self.size += INT_SIZE
             self.dbfile.flush()

-    def fsize(self):
-        s = os.path.getsize(self.path)
-        return s
-
     def _write_int(self, i):
         return struct.pack('<I', i)
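The reason for dropping `fsize()`: every call was an `os.path.getsize` stat on the file, issued once or more per operation. Since both files are append only, the size can be read once on open and then bumped by the number of bytes written. A reduced sketch of that pattern, with illustrative names:

```python
import os
import pathlib

class AppendOnlyFile:
    """Illustrative sketch: cache the end-of-file offset instead of stat-ing every time."""

    def __init__(self, path):
        pathlib.Path(path).touch()          # make sure the file exists
        self.f = open(path, 'rb+')
        self.size = os.path.getsize(path)   # single stat call, on open

    def append(self, payload: bytes) -> int:
        self.f.seek(self.size)              # jump to the cached end of file
        self.f.write(payload)
        offset = self.size
        self.size += len(payload)           # keep the cached size in sync
        return offset
```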
@@ -70,22 +69,22 @@ class IndexFile(BaseFile):
     def write(self, i, position):
         self.idx[i] = position
-        self.dbfile.seek(self.fsize())
+        self.dbfile.seek(self.size)
         self.dbfile.write(self._write_int(i))
         self.dbfile.write(self._write_int(position))
+        self.size += INT_SIZE*2
         self.dbfile.flush()

     def read_index(self):
         self.BaseFile = {}
         index = INT_SIZE
-        end = self.fsize()
+        end = self.size
         while index < end:
             i = self._read_int(self.dbfile.read(INT_SIZE))
             position = self._read_int(self.dbfile.read(INT_SIZE))
             index += INT_SIZE*2
             self.idx[i] = position


 class CrudIndexFile():
     def __init__(self, dbpath='test.db', indexpath='test.index'):
@@ -104,7 +103,7 @@ class CrudIndexFile():
     def write(self, data):
         data, size = self._get_data(data)
-        end = self.base.fsize()
+        end = self.base.size
         # calculate new number of elements
         index = self._read_size()+1
         # go to end
@@ -115,11 +114,12 @@ class CrudIndexFile():
         # increase number of elements
         self._write_size(size=index)
         self.idxdata.write(i=index, position=end)
+        self.base.size += HEADER_SIZE + size
         self.base.dbfile.flush()

     def readall(self):
         position = INT_SIZE
-        end = self.fsize()
+        end = self.base.size
         output = []
         while position < end:
             self.base.dbfile.seek(position)
@@ -142,7 +142,7 @@ class CrudIndexFile():
         position = self.base.dbfile.tell()
         # go to header and override with status updated and set skip to end of file
         self.base.dbfile.seek(position-HEADER_SIZE)
-        end = self.base.fsize()
+        end = self.base.size
         self._write_header(size=size, index=idx, status=STATUS_UPDATED, skip=end)
         # read old value
         old = self.base.dbfile.read(size).decode('utf-8')
@@ -152,6 +152,7 @@ class CrudIndexFile():
         data, size = self._get_data(data)
         self._write_header(size=size, index=idx, status=STATUS_OK, skip=0)
         self.base.dbfile.write(data)
+        self.base.size += HEADER_SIZE + size
         self.base.dbfile.flush()
         return old
@@ -175,7 +176,7 @@ class CrudIndexFile():
     def seek_data(self, index):
         position = self.idxdata.idx.get(index)
-        end = self.base.fsize()
+        end = self.base.size
         while position < end:
             self.base.dbfile.seek(position)
             status, idx, skip, size = self._read_header()
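For orientation when reading `_read_header`/`_write_header` above: each record carries a fixed-size header holding status, index, skip and size, and on update the old record's status is set to STATUS_UPDATED with skip pointing at the end of file where the new version is appended, which is the chain `seek_data` follows. The exact header layout is not part of this diff, so the sketch below assumes four little-endian uint32 fields and illustrative status constants:

```python
import struct

INT_SIZE = 4
HEADER_SIZE = INT_SIZE * 4            # assumption: status, index, skip, size as four uint32s
STATUS_OK, STATUS_UPDATED = 0, 1      # illustrative values only

def pack_header(status, index, skip, size):
    # same field order that _read_header unpacks above
    return struct.pack('<IIII', status, index, skip, size)

def unpack_header(raw):
    status, index, skip, size = struct.unpack('<IIII', raw)
    return status, index, skip, size
```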
@@ -223,24 +224,34 @@ if __name__ == '__main__':
     if os.path.exists(idxpath):
         os.remove(idxpath)
     rstring = lambda size: ''.join(random.choice(string.ascii_letters) for i in range(size))
-    a = time.time()
-    write_elements = 100000
-    read_elements = 100000
+    test_size = 1000000
+    Logger.info('Test elements size {}'.format(test_size))
     with CrudIndexFile() as crud:
-        for i in range(write_elements+1):
-            crud.write(rstring(random.randrange(100, 1000)))
+        test_data = []
+        for i in range(0, 1001):
+            test_data.append(rstring(random.randrange(100, 1000)))
+        test_data_len = len(test_data)
+        a = time.time()
+        for i in range(test_size+1):
+            crud.write(random.choice(test_data))
+            if i % 10000 == 0:
+                Logger.info('write {}'.format(i))
         size = crud.size()
-        print("write elements {} in {}".format(write_elements, time.time() - a))
+        t = time.time() - a
+        print("write elements in {}s - {} per second".format(t, test_size/t))
         b = time.time()
-        for i in range(0, read_elements+1):
+        for i in range(0, test_size+1):
             crud.read(random.randrange(1, size))
-        print("read elements {} in {}".format(read_elements, time.time() - b))
-        Logger.info('read index 2 : ', crud.read(index=2))
-        Logger.info('remove index 3 : ', crud.delete(index=3))
-        Logger.info('update index 2 : ', crud.update(index=2, data=rstring(85)))
-        Logger.info('read index {} : '.format(size), crud.read(index=size))
+            if i % 10000 == 0:
+                Logger.info('read {}'.format(i))
+        t = time.time() - b
+        print("read elements in {}s - {} per second".format(t, test_size/t))
+        crud.read(index=2)
+        crud.delete(index=3)
+        crud.update(index=2, data=rstring(85))
+        crud.read(index=size)
         Logger.info('size : ', crud.size())
-        Logger.info('database fsize : ', process_size.convert_size(crud.base.fsize(), 2))
-        Logger.info('index fsize : ', process_size.convert_size(crud.idxdata.fsize(), 2))
+        Logger.info('database fsize : ', process_size.convert_size(crud.base.size, 2))
+        Logger.info('index fsize : ', process_size.convert_size(crud.idxdata.size, 2))
         Logger.info('pid : ', process_size.get_size())
         Logger.info('total : {}'.format(time.time() - a))

@@ -2,7 +2,10 @@
 # -*- coding: utf-8 -*-
 import math
 import os
-import psutil
+try:
+    import psutil
+except:
+    pass

 def convert_size(size_bytes, index=0):
     if size_bytes == 0:
@@ -15,5 +18,8 @@ def convert_size(size_bytes, index=0):
     return "%s %s" % (s, size_name[index])

 def get_size():
-    process = psutil.Process(os.getpid())
-    return convert_size(process.memory_info().rss, 2)
+    try:
+        process = psutil.Process(os.getpid())
+        return convert_size(process.memory_info().rss, 2)
+    except:
+        return 0
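With both guards in place the benchmark no longer hard-depends on psutil: if the import fails, `get_size()` simply returns 0 instead of the process resident set size, which matches the README change that makes psutil optional.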