Commit 1f41ccfc authored by Mouadh

config_file + postgres_backend

parent de960516
<?xml version="1.0" encoding="UTF-8"?>
<cubes>
<cube>
<!-- cube name => db name -->
<name>labster</name>
<!-- source : postgres | csv -->
<source>postgres</source>
<!--
start building the customized star schema
-->
<facts>
<!-- facts table name -->
<table_name>stats_line</table_name>
<keys>
<!-- ref = table_name.column -->
<column_name ref="orgunit.id">departement_id</column_name>
</keys>
<!-- specify measures explicitly -->
<measures>
<!-- by default, every numeric column of the facts table is a measure; you can restrict the list here -->
<name>montant</name>
<name>salaire_brut_mensuel</name>
<name>cout_total_mensuel</name>
</measures>
</facts>
<!--
end of the customized star schema
-->
<!--
start building the customized dimension display (as shown in Excel) from the star schema
-->
<dimensions>
<dimension>
<!-- to keep the same name in the Excel display, use the same value for name and displayName -->
<name>stats_line</name>
<displayName>Demande</displayName>
<columns>
<!-- column order matters -->
<name>type_demande</name>
<name>financeur</name>
<name>wf_state</name>
<name>type_recrutement</name>
</columns>
</dimension>
<dimension>
<!-- to keep the same name in the Excel display, use the same value for name and displayName -->
<name>orgunit</name>
<displayName>Organisation</displayName>
<columns>
<!-- column order matters -->
<name>type</name>
<name>nom</name>
<name>sigle</name>
</columns>
</dimension>
</dimensions>
<!--
end of the customized dimension display (as shown in Excel)
-->
</cube>
</cubes>
\ No newline at end of file
class ConditionError(Exception):
def __init__(self, value):
self.value = value
def __str__(self):
return repr(self.value)
# -*- encoding: utf8 -*-
from __future__ import absolute_import, division, print_function
import itertools
@@ -7,6 +9,10 @@ from collections import OrderedDict
import numpy as np
import pandas as pd
import pandas.io.sql as psql
from ..utils.connection import MyDB
from ..utils.config_file_parser import ConfigParser
class MdxEngine:
@@ -17,14 +23,18 @@ class MdxEngine:
:param mdx_query: query to execute
"""
CUBE_FOLDER = "cubes"
FACTS_TABLE_NAME = "Facts"
# class-level attributes: cube information must be available before MdxEngine is instantiated
csv_files_cubes = []
postgres_db_cubes = []
# used to display only the dimensions declared in the config file
dimension_display_name = []
def __init__(self,
cube_name,
mdx_query=None,
cube_folder=CUBE_FOLDER,
sep=';',
fact_table_name=FACTS_TABLE_NAME):
fact_table_name="Facts"):
'''
:param cube_folder: parent cube folder name
@@ -37,12 +47,14 @@ class MdxEngine:
self.facts = fact_table_name
self.mdx_query = mdx_query
self.cube_path = self._get_cube_path()
self.load_star_schema_dataframe = self._get_star_schema_dataframe(
cube_name)
# discover cubes now; fills csv_files_cubes and postgres_db_cubes as a side effect
self._ = self.get_cubes_names()
self.tables_loaded = self._load_tables()
self.tables_names = self._get_tables_name()
# all measures
self.measures = self._get_measures()
self.load_star_schema_dataframe = self._get_star_schema_dataframe(
cube_name)
self.tables_names = self._get_tables_name()
# default measure is the first one
self.selected_measures = [self.measures[0]]
@@ -52,42 +64,130 @@ class MdxEngine:
:return: list of cube names (csv folders and postgres databases)
'''
# get csv cube folders (one folder per cube)
try:
location = os.path.join(
os.path.abspath(
os.path.join(
os.path.dirname(__file__), "..", "..", "..", "..")),
MdxEngine.CUBE_FOLDER)
return [
MdxEngine.csv_files_cubes = [
file for file in os.listdir(location)
if os.path.isdir(os.path.join(location, file))
]
except OSError:
# no local cubes folder, fall back to postgres discovery
pass
# get postgres databases
try:
db = MyDB()
cursor = db.connection.cursor()
cursor.execute("""SELECT datname FROM pg_database
WHERE datistemplate = false;""")
MdxEngine.postgres_db_cubes = [
database[0] for database in cursor.fetchall()
]
except Exception:
# no reachable postgres server, keep csv cubes only
pass
return MdxEngine.csv_files_cubes + MdxEngine.postgres_db_cubes
def _get_tables_name(self):
return self.tables_loaded.keys()
def _get_cube_path(self):
'''
:return: return local cube folder name with full path
'''
return os.path.join(
os.path.abspath(
os.path.join(
os.path.dirname(__file__), '..', "..", '..', '..')),
self.cube_folder)
def _load_table_config_file(self, cube_obj):
"""
load tables from config file
:param cube_obj: cubes object
:return: tables dict with table name as key and dataframe as value
"""
def _load_tables(self):
tables = {}
# only one facts table is supported for now
self.facts = cube_obj.facts[0].table_name
db = MyDB(db=self.cube)
for table in cube_obj.dimensions:
value = psql.read_sql_query("SELECT * FROM {0}".format(table.name),
db.connection)
tables[table.name] = value[[
col for col in value.columns if col.lower()[-3:] != '_id'
]]
# update table display name
for dimension in cube_obj.dimensions:
if dimension.displayName and dimension.name and dimension.displayName != dimension.name:
tables[dimension.displayName] = tables[dimension.name][
dimension.columns]
MdxEngine.dimension_display_name.append(dimension.name)
return tables
def _load_tables_csv_files(self):
"""
load all tables
:return: dict with key as table name and DataFrame as value
load tables from csv files
:return: tables dict with table name as key and dataframe as value
"""
cube = self.get_cube()
tables = {}
cube = self.get_cube()
for file in os.listdir(cube):
# strip the '.csv' extension to get the table name
table_name = os.path.splitext(file)[0]
value = pd.read_csv(os.path.join(cube, file), sep=self.sep)
tables[table_name] = value[
[col for col in value.columns if col.lower()[-3:] != '_id']]
tables[table_name] = value[[
col for col in value.columns if col.lower()[-3:] != '_id'
]]
return tables
def _load_tables_db(self):
"""
load tables from database
:return: tables dict with table name as key and dataframe as value
"""
tables = {}
db = MyDB(db=self.cube)
cursor = db.connection.cursor()
cursor.execute("""SELECT table_name FROM information_schema.tables
WHERE table_schema = 'public'""")
for table_name in cursor.fetchall():
value = psql.read_sql_query(
'SELECT * FROM "{0}" '.format(table_name[0]), db.connection)
tables[table_name[0]] = value[[
col for col in value.columns if col.lower()[-3:] != '_id'
]]
return tables
def _load_tables(self):
"""
load all tables
:return: dict with key as table name and DataFrame as value
"""
config_file_parser = ConfigParser(self.cube_path)
tables = {}
if config_file_parser.config_file_exist(
) and self.cube in config_file_parser.get_cubes_names():
for cubes in config_file_parser.construct_cubes():
# TODO cubes.source == 'csv'
if cubes.source == 'postgres':
tables = self._load_table_config_file(cubes)
elif self.cube in self.csv_files_cubes:
tables = self._load_tables_csv_files()
elif self.cube in self.postgres_db_cubes:
tables = self._load_tables_db()
return tables
def _get_measures(self):
@@ -95,21 +195,124 @@ class MdxEngine:
:return: all numerical columns in facts table
"""
return list(self.tables_loaded[self.facts].select_dtypes(
include=[np.number]).columns)
# col.lower()[-2:] != 'id' to ignore any id column
return [
col
for col in self.tables_loaded[self.facts].select_dtypes(
include=[np.number]).columns if col.lower()[-2:] != 'id'
]
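A minimal sketch (toy data, assumed names) of the measure detection above: every numeric column of the facts table becomes a measure, minus anything whose name ends in 'id':

import numpy as np
import pandas as pd

facts = pd.DataFrame({'departement_id': [1, 2], 'montant': [100.0, 250.0]})
measures = [
    col for col in facts.select_dtypes(include=[np.number]).columns
    if col.lower()[-2:] != 'id'
]
# measures -> ['montant']  ('departement_id' ends with 'id' and is dropped)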
def _construct_star_schema_config_file(self, cube_name, cubes_obj):
"""
Construct star schema Dataframe from configuration file
:param cube_name: cube name (or database name)
:param cubes_obj: cubes object
:return: star schema Dataframe
"""
self.facts = cubes_obj.facts[0].table_name
db = MyDB(db=cube_name)
# load facts table
fusion = psql.read_sql_query("SELECT * FROM {0}".format(self.facts),
db.connection)
for fact_key, dimension_and_key in cubes_obj.facts[0].keys.items():
df = psql.read_sql_query(
"SELECT * FROM {0}".format(dimension_and_key.split('.')[0]),
db.connection)
fusion = fusion.merge(
df,
left_on=fact_key,
right_on=dimension_and_key.split('.')[1],
how='outer')
# TODO choose between these DataFrames:
# if dimensions are kept in separate tables:
# fusion = fusion.merge(df, left_on=fact_key, right_on=dimension_and_key.split('.')[1])
# if the facts table already contains all dimensions:
# fusion = facts
# measures in config-file only
if cubes_obj.facts[0].measures:
self.measures = cubes_obj.facts[0].measures
return fusion
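To make the keys mechanism above concrete, here is a hedged sketch using the sample config file from this commit (stats_line.departement_id references orgunit.id); in-memory frames stand in for the real postgres tables:

import pandas as pd

facts = pd.DataFrame({'departement_id': [1, 2], 'montant': [100.0, 250.0]})
orgunit = pd.DataFrame({'id': [1, 2], 'nom': ['DSI', 'RH']})

# keys as parsed from the sample config: {'departement_id': 'orgunit.id'}
for fact_key, dimension_and_key in {'departement_id': 'orgunit.id'}.items():
    df = orgunit  # stands in for: SELECT * FROM orgunit
    facts = facts.merge(
        df,
        left_on=fact_key,
        right_on=dimension_and_key.split('.')[1],
        how='outer')
# facts now holds montant plus the orgunit columns (id, nom)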
def _construct_star_schema_csv_files(self, cube_name):
"""
Construct star schema Dataframe from csv files
:param cube_name: cube name (folder name)
:return: star schema Dataframe
"""
cube = self.get_cube()
# loading facts table
fusion = pd.read_csv(
os.path.join(cube, self.facts + '.csv'), sep=self.sep)
for file_name in os.listdir(cube):
try:
fusion = fusion.merge(
pd.read_csv(os.path.join(cube, file_name), sep=self.sep))
except Exception:
# this table shares no column with the facts table, skip it
print('No common column')
return fusion
def _construct_star_schema_db(self, cube_name):
"""
Construct star schema Dataframe from database
:param cube_name: cube name (database name)
:return: star schema Dataframe
"""
db = MyDB(db=cube_name)
def _get_star_schema_dataframe(self, cube):
# load facts table
fusion = psql.read_sql_query('SELECT * FROM "{0}" '.format(self.facts),
db.connection)
cursor = db.connection.cursor()
cursor.execute("""SELECT table_name FROM information_schema.tables
WHERE table_schema = 'public'""")
for db_table_name in cursor.fetchall():
try:
fusion = fusion.merge(
psql.read_sql_query("SELECT * FROM {0}".format(
db_table_name[0]), db.connection))
except Exception:
# this table shares no column with the facts table, skip it
print('No common column')
return fusion
def _get_star_schema_dataframe(self, cube_name):
'''
:return: all DataFrames merged as star schema
'''
# star schema = (http://datawarehouse4u.info/Data-warehouse-schema-architecture-star-schema.html)
cube = self.get_cube()
# loading facts table
df = pd.read_csv(os.path.join(cube, self.facts + '.csv'), sep=self.sep)
for f in os.listdir(cube):
df = df.merge(pd.read_csv(os.path.join(cube, f), sep=self.sep))
# TODO check this
return df[[col for col in df.columns if col.lower()[-3:] != '_id']]
fusion = None
config_file_parser = ConfigParser(self.cube_path)
if config_file_parser.config_file_exist(
) and cube_name in config_file_parser.get_cubes_names():
for cubes in config_file_parser.construct_cubes():
# TODO cubes.source == 'csv'
if cubes.source == 'postgres':
fusion = self._construct_star_schema_config_file(cube_name,
cubes)
elif cube_name in self.csv_files_cubes:
fusion = self._construct_star_schema_csv_files(cube_name)
elif cube_name in self.postgres_db_cubes:
fusion = self._construct_star_schema_db(cube_name)
return fusion[
[col for col in fusion.columns if col.lower()[-3:] != '_id']]
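Whatever the source, the result is a classic star-schema join; a minimal sketch with made-up tables:

import pandas as pd

facts = pd.DataFrame({'product_id': [1, 1, 2], 'amount': [10, 20, 5]})
product = pd.DataFrame({'product_id': [1, 2], 'product_name': ['A', 'B']})

fusion = facts.merge(product)  # joins on the shared 'product_id' column
fusion = fusion[[col for col in fusion.columns if col.lower()[-3:] != '_id']]
# fusion.columns -> ['amount', 'product_name']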
def get_all_tables_names(self, ignore_fact=False):
"""
@@ -122,18 +325,30 @@ class MdxEngine:
return [tab for tab in self.tables_names if self.facts not in tab]
return self.tables_names
def _get_cube_path(self):
'''
:return: return local cube folder name with full path
'''
return os.path.join(
os.path.abspath(
os.path.join(
os.path.dirname(__file__), '..', "..", '..', '..')),
self.cube_folder)
def get_cube(self):
"""
get path to the cube (example /home/your_user_name/olapy-core/cubes)
:return: path to the cube
"""
return os.path.join(self.cube_path, self.cube)
# TODO temporary function
def get_tuples(self, query, start=None, stop=None):
# TODO use grako instead and remove regex
regex = "(\[[\w\d ]+\](\.\[[\w\d\.\- ]+\])*\.?((Members)|(\[Q\d\]))?)"
# new regex (2017-02-08) that also accepts French accented characters
regex = "(\[[\w+\d ]+\](\.\[[\w+\d\.\,\s\_\-\é\ù\è\ù\û\ü\ÿ\\\à\â\æ\ç\é\è\ê\ë\ï\î" \
"\ô\œ\Ù\Û\Ü\Ÿ\À\Â\Æ\Ç\É\È\Ê\Ë\Ï\Î\Ô\Œ\& ]+\])*\.?((Members)|(\[Q\d\]))?)"
if start is not None:
start = query.index(start)
@@ -144,7 +359,9 @@ class MdxEngine:
return [[
tup_att.replace('All ', '').replace('[', "").replace("]", "")
for tup_att in tup[0].replace('.Members', '').split('.')
] for tup in re.compile(regex).findall(query[start:stop])
]
for tup in re.compile(regex).findall(
query.encode("utf-8")[start:stop])
if len(tup[0].split('.')) > 1]
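A rough, self-contained illustration of what get_tuples extracts, using the first (ASCII-only) regex above on an assumed query:

import re

regex = "(\[[\w\d ]+\](\.\[[\w\d\.\- ]+\])*\.?((Members)|(\[Q\d\]))?)"
query = "SELECT [Geography].[Geography].[Country].Members ON COLUMNS FROM [sales]"
tuples = [
    [part.replace('All ', '').replace('[', '').replace(']', '')
     for part in match[0].replace('.Members', '').split('.')]
    for match in re.compile(regex).findall(query)
    if len(match[0].split('.')) > 1
]
# tuples -> [['Geography', 'Geography', 'Country']]
# the bare [sales] match is dropped by the len(...) > 1 filter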
# TODO temporary function
@@ -156,7 +373,6 @@ class MdxEngine:
"""
tuples_on_mdx_query = self.get_tuples(query)
on_rows = []
on_columns = []
on_where = []
@@ -449,7 +665,6 @@ class MdxEngine:
# use measures that exists on where or insides axes
query_axes = self.decorticate_query(self.mdx_query)
if self.change_measures(query_axes['all']):
self.selected_measures = self.change_measures(query_axes['all'])
@@ -523,7 +738,8 @@ class MdxEngine:
return {
'result':
df.drop_duplicates().replace(np.nan, -1).groupby(cols).sum(),
'columns_desc': tables_n_columns
'columns_desc':
tables_n_columns
}
else:
......
from __future__ import absolute_import, division, print_function
import os
from lxml import etree
from .models import Cube, Facts, Dimension
class ConfigParser:
def __init__(self, cube_path, file_name='cubes-config.xml'):
self.cube_path = cube_path
self.file_name = file_name
def config_file_exist(self):
return os.path.isfile(os.path.join(self.cube_path, self.file_name))
def get_cubes_names(self):
with open(os.path.join(self.cube_path, self.file_name)) as config_file:
parser = etree.XMLParser()
tree = etree.parse(config_file, parser)
try:
return {
cube.find('name').text: cube.find('source').text
for cube in tree.xpath('/cubes/cube')
}
except:
raise ValueError('missing name or source tag in the config file')
def construct_cubes(self):
if self.config_file_exist():
try:
with open(os.path.join(self.cube_path,
self.file_name)) as config_file:
parser = etree.XMLParser()
tree = etree.parse(config_file, parser)
facts = [
Facts(
table_name=xml_facts.find('table_name').text,
keys={
key.text: key.attrib['ref']
for key in xml_facts.findall(
'keys/column_name')
},
measures=[
mes.text
for mes in xml_facts.findall('measures/name')
]) for xml_facts in tree.xpath('/cubes/cube/facts')
]
dimensions = [
Dimension(
name=xml_dimension.find('name').text,
displayName=xml_dimension.find('displayName').text,
columns=[
column_name.text
for column_name in xml_dimension.findall(
'columns/name')
])
for xml_dimension in tree.xpath(
'/cubes/cube/dimensions/dimension')
]
return [
Cube(
name=xml_cube.find('name').text,
source=xml_cube.find('source').text,
facts=facts,
dimensions=dimensions)
for xml_cube in tree.xpath('/cubes/cube')
]
except:
raise ValueError('bad configuration in the config file')
else:
raise ("Config file don't exist")
import psycopg2 as pg
class MyDB(object):
def __init__(self, username='postgres', password='root', db=None):
if db is None:
self.connection = pg.connect(
"user={0} password={1}".format(username, password))
else:
self.connection = pg.connect("user={0} password={1} dbname='{2}'".
format(username, password, db))
def __del__(self):
self.connection.close()
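A short usage sketch (assumes a reachable local postgres server; note the credentials above are hard-coded defaults):

# list user databases, as MdxEngine.get_cubes_names() does
db = MyDB()
cursor = db.connection.cursor()
cursor.execute("SELECT datname FROM pg_database WHERE datistemplate = false;")
print([row[0] for row in cursor.fetchall()])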
class Facts:
def __init__(self, **kwargs):
self.__dict__.update(kwargs)
def __str__(self):
return str(self.__dict__)
class Dimension:
def __init__(self, **kwargs):
self.__dict__.update(kwargs)
def __str__(self):
return str(self.__dict__)
class Cube:
def __init__(self, **kwargs):
self.__dict__.update(kwargs)
def __str__(self):
return str(self.__dict__)
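For reference, the objects ConfigParser.construct_cubes() builds from the sample config file in this commit look roughly like this:

facts = Facts(
    table_name='stats_line',
    keys={'departement_id': 'orgunit.id'},
    measures=['montant', 'salaire_brut_mensuel', 'cout_total_mensuel'])
dimension = Dimension(
    name='orgunit',
    displayName='Organisation',
    columns=['type', 'nom', 'sigle'])
cube = Cube(name='labster', source='postgres',
            facts=[facts], dimensions=[dimension])
print(cube)  # __str__ dumps the underlying attribute dict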
# -*- encoding: utf8 -*-
from __future__ import absolute_import, division, print_function
from datetime import datetime
@@ -82,7 +84,8 @@ class XmlaProviderService(ServiceBase):
return discover_tools.discover_mdschema_measures__response(request)
elif request.RequestType == "MDSCHEMA_DIMENSIONS":
return discover_tools.discover_mdschema_dimensions_response(request)
return discover_tools.discover_mdschema_dimensions_response(
request)
elif request.RequestType == "MDSCHEMA_HIERARCHIES":
return discover_tools.discover_mdschema_hierarchies_response(
@@ -100,7 +103,8 @@ class XmlaProviderService(ServiceBase):
request)
elif request.RequestType == "MDSCHEMA_PROPERTIES":
return discover_tools.discover_mdschema_properties_response(request)
return discover_tools.discover_mdschema_properties_response(
request)
elif request.RequestType == "MDSCHEMA_MEMBERS":
return discover_tools.discover_mdschema_members_response(request)
@@ -137,6 +141,7 @@ class XmlaProviderService(ServiceBase):
executer.mdx_query = request.Command.Statement
df = executer.execute_mdx()
xmla_tools = XmlaExecuteTools(executer)
return etree.fromstring("""
<return>
<root xmlns="urn:schemas-microsoft-com:xml-analysis:mddataset"
@@ -175,7 +180,16 @@ class XmlaProviderService(ServiceBase):
xmla_tools.generate_xs0(df),
xmla_tools.generate_slicer_axis(df),
xmla_tools.generate_cell_data(df),
datetime.now().strftime('%Y-%m-%dT%H:%M:%S')))
datetime.now().strftime('%Y-%m-%dT%H:%M:%S')).replace(
'&', '&amp;'))
# Problem:
# An XML parser returns the error "xmlParseEntityRef: no name"
#
# Cause:
# There is a stray '&' (ampersand) somewhere in the XML text,
# e.g. "some text & some more text"
#
# Solution:
# .replace('&', '&amp;')
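A minimal demonstration of the failure and the fix (lxml is already a dependency here):

from lxml import etree

broken = '<row><name>R & D</name></row>'
try:
    etree.fromstring(broken)
except etree.XMLSyntaxError as error:
    print(error)  # xmlParseEntityRef: no name, line 1, ...

fixed = broken.replace('&', '&amp;')
print(etree.fromstring(fixed).findtext('name'))  # R & D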
application = Application(
......
# -*- encoding: utf8 -*-
from __future__ import absolute_import, division, print_function
import uuid
@@ -5,19 +7,14 @@ import uuid
from lxml import etree
from ..mdx.executor.execute import MdxEngine
from .xmla_discover_xsds import (dbschema_catalogs_xsd, dbschema_tables_xsd,
discover_datasources_xsd,
discover_literals_xsd,
discover_preperties_xsd,
discover_schema_rowsets_xsd,
mdschema_cubes_xsd, mdschema_dimensions_xsd,
mdschema_hierarchies_xsd, mdschema_kpis_xsd,
mdschema_levels_xsd, mdschema_measures_xsd,
mdschema_measuresgroups_dimensions_xsd,
mdschema_measuresgroups_xsd,
mdschema_members_xsd,
mdschema_properties_PROPERTIES_xsd,
mdschema_sets_xsd)
from .xmla_discover_xsds import (
dbschema_catalogs_xsd, dbschema_tables_xsd, discover_datasources_xsd,
discover_literals_xsd, discover_preperties_xsd,
discover_schema_rowsets_xsd, mdschema_cubes_xsd, mdschema_dimensions_xsd,
mdschema_hierarchies_xsd, mdschema_kpis_xsd, mdschema_levels_xsd,
mdschema_measures_xsd, mdschema_measuresgroups_dimensions_xsd,
mdschema_measuresgroups_xsd, mdschema_members_xsd,
mdschema_properties_PROPERTIES_xsd, mdschema_sets_xsd)
# TODO clean
@@ -76,7 +73,6 @@ class XmlaDiscoverTools():
</return>""")
def discover_properties_response(self, request):
def get_props(xsd, PropertyName, PropertyDescription, PropertyType,
PropertyAccessType, IsRequired, Value):
return etree.fromstring("""
@@ -118,15 +114,15 @@ class XmlaDiscoverTools():
elif request.Restrictions.RestrictionList.PropertyName == 'MdpropMdxSubqueries':
if 'Unspecified' in request.Properties.PropertyList.Catalog:
return get_props(discover_preperties_xsd, 'MdpropMdxSubqueries',
'MdpropMdxSubqueries', 'int', 'Read', 'false',
'15')
return get_props(discover_preperties_xsd,
'MdpropMdxSubqueries', 'MdpropMdxSubqueries',
'int', 'Read', 'false', '15')
if request.Properties.PropertyList.Catalog is not None:
self.change_catalogue(request.Properties.PropertyList.Catalog)
return get_props(discover_preperties_xsd, 'MdpropMdxSubqueries',
'MdpropMdxSubqueries', 'int', 'Read', 'false',
'15')
return get_props(discover_preperties_xsd,
'MdpropMdxSubqueries', 'MdpropMdxSubqueries',
'int', 'Read', 'false', '15')
elif request.Restrictions.RestrictionList.PropertyName == 'MdpropMdxDrillFunctions':
if 'Unspecified' in request.Properties.PropertyList.Catalog:
@@ -1826,8 +1822,15 @@ class XmlaDiscoverTools():
request.Properties.PropertyList.Catalog)
rows = ""
ord = 1
for tables in self.executer.get_all_tables_names(
ignore_fact=True):
# TODO rework this
# hide dimensions whose display name is overridden in the config file
if MdxEngine.dimension_display_name != []:
if tables in MdxEngine.dimension_display_name:
continue
rows += """
<row>
<CATALOG_NAME>{0}</CATALOG_NAME>
@@ -2264,6 +2267,9 @@ class XmlaDiscoverTools():
# separed_tuple -> [Product].[Product].[Company].[Crazy Development]
# joined -> [Product].[Product].[Company]
last_attribut = ''.join(att for att in separed_tuple[-1]
if att not in '[]').replace(
'&', '&amp;')
return etree.fromstring("""
<return>
<root xmlns="urn:schemas-microsoft-com:xml-analysis:rowset"
@@ -2278,20 +2284,21 @@ class XmlaDiscoverTools():
<LEVEL_UNIQUE_NAME>{2}</LEVEL_UNIQUE_NAME>
<LEVEL_NUMBER>0</LEVEL_NUMBER>
<MEMBER_ORDINAL>0</MEMBER_ORDINAL>
<MEMBER_NAME>{4}</MEMBER_NAME>
<MEMBER_NAME>""" + last_attribut +
"""</MEMBER_NAME>
<MEMBER_UNIQUE_NAME>{3}</MEMBER_UNIQUE_NAME>
<MEMBER_TYPE>1</MEMBER_TYPE>
<MEMBER_CAPTION>{4}</MEMBER_CAPTION>
<MEMBER_CAPTION>""" + last_attribut +
"""</MEMBER_CAPTION>
<CHILDREN_CARDINALITY>1</CHILDREN_CARDINALITY>
<PARENT_LEVEL>0</PARENT_LEVEL>
<PARENT_COUNT>0</PARENT_COUNT>
<MEMBER_KEY>{4}</MEMBER_KEY>
<MEMBER_KEY>""" + last_attribut + """</MEMBER_KEY>
<IS_PLACEHOLDERMEMBER>false</IS_PLACEHOLDERMEMBER>
<IS_DATAMEMBER>false</IS_DATAMEMBER>
</row>
</root>
</return>
""".format(
self.selected_catalogue, separed_tuple[0], joined,
request.Restrictions.RestrictionList.MEMBER_UNIQUE_NAME,
''.join(c for c in separed_tuple[-1] if c not in '[]')))
""".format(self.selected_catalogue, separed_tuple[0],
joined, request.Restrictions.RestrictionList.
MEMBER_UNIQUE_NAME))