Commit 1cbff971 authored by Levin Zimmermann

Allow patched pandas.read_* in restricted Python

See merge request !1615
parents 85317472 4360dbc6
Pipeline #21646 canceled with stage
in 0 seconds
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
# #
############################################################################## ##############################################################################
import json
import os.path import os.path
import tempfile import tempfile
import textwrap import textwrap
...@@ -572,14 +573,6 @@ class TestRestrictedPythonSecurity(ERP5TypeTestCase): ...@@ -572,14 +573,6 @@ class TestRestrictedPythonSecurity(ERP5TypeTestCase):
) )
def testPandasIORead(self): def testPandasIORead(self):
self.assertRaises(
Unauthorized,
self.createAndRunScript,
'''
import pandas as pd
pd.read_csv('testPandasIORead.csv')
''')
# Test the black_list configuration validity # Test the black_list configuration validity
for read_method in pandas_black_list: for read_method in pandas_black_list:
self.assertRaises( self.assertRaises(
...@@ -635,6 +628,148 @@ class TestRestrictedPythonSecurity(ERP5TypeTestCase): ...@@ -635,6 +628,148 @@ class TestRestrictedPythonSecurity(ERP5TypeTestCase):
write_method('testPandasSeriesIOWrite.data') write_method('testPandasSeriesIOWrite.data')
'''.format(write_method=write_method)) '''.format(write_method=write_method))
def _assertPandasRestrictedReadFunctionIsEqualTo(
  self, read_function, read_argument, expected_data_frame_init
):
  """Assert that a pandas read function yields the expected data frame.

  A script is generated and handed to ``createAndRunScript``; it evaluates
  ``pd.<read_function>(<read_argument>)`` and compares the result, using
  ``DataFrame.equals``, with ``pd.DataFrame(<expected_data_frame_init>)``.
  The script is expected to return ``True``.

  All three parameters are source code fragments which are pasted verbatim
  into the generated script:

  read_function -- name of the function looked up on ``pd``
  read_argument -- argument list passed to that function
  expected_data_frame_init -- argument list passed to ``pd.DataFrame``
  """
  script_template = (
    "\n"
    "import pandas as pd\n"
    "expected_data_frame = pd.DataFrame({expected_data_frame_init})\n"
    "return pd.{read_function}({read_argument}).equals(expected_data_frame)\n"
  )
  script = script_template.format(
    read_function=read_function,
    read_argument=read_argument,
    expected_data_frame_init=expected_data_frame_init,
  )
  self.createAndRunScript(script, expected=True)
def testPandasRestrictedReadFunctionProhibitedInput(self):
  """
  Check that the patched pandas read_* functions raise for any input
  which is not a string.
  """
  script_template = (
    "\n"
    "import pandas as pd\n"
    "{preparation}\n"
    "pd.{pandas_read_function}({prohibited_input})\n"
  )
  # Each case is (statement run before the call, expression passed as input)
  prohibited_case_tuple = (
    ('', 100),
    # NOTE(review): the 'StringIO' module only exists on Python 2 - confirm
    # this case still exercises the type check when run on Python 3.
    ('from StringIO import StringIO', 'StringIO("[1, 2, 3]")'),
  )
  for pandas_read_function in ("read_json", "read_csv", "read_fwf"):
    for preparation, prohibited_input in prohibited_case_tuple:
      script = script_template.format(
        preparation=preparation,
        pandas_read_function=pandas_read_function,
        prohibited_input=prohibited_input,
      )
      self.assertRaises(
        ZopeGuardsUnauthorized, self.createAndRunScript, script
      )
def testPandasReadFwf(self):
  """Check the data frames produced by the patched ``read_fwf``."""
  # Each case is (argument list for read_fwf, argument list for the
  # expected pd.DataFrame).
  case_tuple = (
    # Normal input should be correctly handled
    (r'"100\n200"', r"[[200]], columns=['100']"),
    # Keyword arguments must be passed on to the patched function
    (r'"1020\n3040", widths=[2, 2]', r"[[30, 40]], columns=['10', '20']"),
    # A string containing an url or file path is handled like any other
    # data string: it ends up as the column name
    (
      r'"file://path/to/fwf/file.fwf"',
      r"[], columns=['file://path/to/fwf/file.fwf']",
    ),
  )
  for read_argument, expected_data_frame_init in case_tuple:
    self._assertPandasRestrictedReadFunctionIsEqualTo(
      "read_fwf", read_argument, expected_data_frame_init
    )
def testPandasReadCSV(self):
  """Check the data frames produced by the patched ``read_csv``."""
  # Each case is (argument list for read_csv, argument list for the
  # expected pd.DataFrame).
  case_tuple = (
    # Normal input should be correctly handled
    (
      r'"11,2,300\n50.5,99,hello"',
      r"[[50.5, 99, 'hello']], columns='11 2 300'.split(' ')",
    ),
    # Keyword arguments must be passed on to the patched function
    (r'"a;b", sep=";"', r"[], columns=['a', 'b']"),
    # Strings containing an url or a file path are handled like any other
    # csv string: they end up as the single column name
    (
      r'"https://people.sc.fsu.edu/~jburkardt/data/csv/addresses.csv"',
      r"[], columns=['https://people.sc.fsu.edu/~jburkardt/data/csv/addresses.csv']",
    ),
    (
      r'"file://path/to/csv/file.csv"',
      r"[], columns=['file://path/to/csv/file.csv']",
    ),
  )
  for read_argument, expected_data_frame_init in case_tuple:
    self._assertPandasRestrictedReadFunctionIsEqualTo(
      "read_csv", read_argument, expected_data_frame_init
    )
def testPandasReadJsonParsesInput(self):
  """Check the data frames produced by the patched ``read_json``."""
  # Each case is (argument list for read_json, argument list for the
  # expected pd.DataFrame).
  case_tuple = (
    # Normal input should be correctly handled
    ('"[1, 2, 3]"', "[1, 2, 3]"),
    (
      '\'{"column_name": [1, 2, 3], "another_column": [3, 9.2, 100]}\'',
      '{"column_name": [1, 2, 3], "another_column": [3, 9.2, 100]}',
    ),
    # Keyword arguments must be passed on to the patched function
    (r'"[1, 2, 3]\n[4, 5, 6]", lines=True', "[[1, 2, 3], [4, 5, 6]]"),
  )
  for read_argument, expected_data_frame_init in case_tuple:
    self._assertPandasRestrictedReadFunctionIsEqualTo(
      "read_json", read_argument, expected_data_frame_init
    )
  # URLs, file paths, etc. are expected to raise a ValueError instead
  # (see testPandasReadJsonProhibitsMalicousString)
def testPandasReadJsonProhibitsMalicousString(self):
  """
  Check that file paths, urls and other strings which are not valid
  json raise a ValueError when given to read_json.
  """
  # Write a valid json file; a read_json call that treated its argument
  # as a path would be able to load it.
  json_path = ".testPandasReadJson.json"
  json_payload = [1, 2, 3]
  with open(json_path, 'w') as writable_file:
    json.dump(json_payload, writable_file)
  self.addCleanup(os.remove, json_path)

  # Make sure the file really exists and holds the expected data
  self.assertTrue(os.path.isfile(json_path))
  with open(json_path, "r") as readable_file:
    self.assertEqual(json_payload, json.loads(readable_file.read()))

  malicious_input_tuple = (
    # Read as an url this should raise an URLError; parsed as a json
    # string it raises a ValueError.
    "https://test-url.com/test-name.json",
    "file://path/to/json/file.json",
    # Read as a path this should not raise at all, because the file
    # created above exists and is valid json.
    json_path,
    # Gibberish should also raise a ValueError
    "Invalid-string",
  )
  script_template = (
    "\n"
    "import pandas as pd\n"
    "pd.read_json(\"{}\")\n"
  )
  for malicious_input in malicious_input_tuple:
    self.assertRaises(
      ValueError,
      self.createAndRunScript,
      script_template.format(malicious_input),
    )
def test_suite(): def test_suite():
suite = unittest.TestSuite() suite = unittest.TestSuite()
......
##############################################################################
#
# Copyright (c) 2012 Nexedi SARL and Contributors. All Rights Reserved.
# Levin Zimmermann <levin.zimmermann@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
"""
Restricted pandas module.
From restricted python, use "import pandas" (see patches/Restricted.py).
"""
from pandas import *
# Add restricted versions of IO functions
import six as _six
from AccessControl.ZopeGuards import Unauthorized as _ZopeGuardsUnauthorized
if _six.PY2:
from StringIO import StringIO as _StringIO
else:
from io import StringIO as _StringIO
def _addRestrictedPandasReadFunction(function_name):
  """Publish a restricted wrapper around ``pandas.<function_name>``.

  The wrapper is stored in this module's globals under ``function_name``.
  It only accepts a builtin ``str`` as first argument and wraps it in a
  StringIO before calling the original pandas function, so the original
  function always receives a buffer and never the string itself. Remaining
  positional and keyword arguments are passed on unchanged.

  function_name -- name of the attribute of the ``pandas`` module to wrap

  The wrapper raises ``Unauthorized`` (from AccessControl.ZopeGuards) when
  its first argument is not exactly of type ``str``.
  """
  original_function = getattr(__import__('pandas'), function_name)

  def Pandas_read(data_string, *args, **kwargs):
    # Strict: don't use 'isinstance', only allow builtin str objects,
    # so that str subclasses are rejected as well.
    if type(data_string) is not str:
      raise _ZopeGuardsUnauthorized(
        "Parsing object '%s' of type '%s' is prohibited!" % (data_string, type(data_string))
      )
    string_io = _StringIO(data_string)
    return original_function(string_io, *args, **kwargs)

  disclaimer = (
    "\n\n"
    "Disclaimer:\n"
    "This function has been patched by ERP5 for zope sandbox usage.\n"
    "Only objects of type 'str' are valid inputs, file paths, files,\n"
    "urls, etc. are prohibited or ignored.\n"
  )
  # Carry the name of the wrapped function, so that tracebacks and
  # introspection show e.g. 'read_csv' instead of 'Pandas_read'.
  Pandas_read.__name__ = function_name
  # The original docstring is None when docstrings are stripped
  # (python -OO); fall back to an empty string instead of raising.
  Pandas_read.__doc__ = (original_function.__doc__ or "") + disclaimer
  globals()[function_name] = Pandas_read
def _addRestrictedPandasReadFunctionTuple():
  """Install the restricted wrapper for every supported pandas reader.

  Calls ``_addRestrictedPandasReadFunction`` once per listed function
  name, in the listed order.
  """
  for function_name in (
    "read_json",
    # "read_html",  # needs installation of additional dependency: html5lib
    "read_csv",
    "read_fwf",
    # "read_xml",  # only available for pandas version >= 1.3.0
  ):
    _addRestrictedPandasReadFunction(function_name)
_addRestrictedPandasReadFunctionTuple()
\ No newline at end of file
...@@ -371,6 +371,7 @@ MNAME_MAP = { ...@@ -371,6 +371,7 @@ MNAME_MAP = {
'calendar': 'Products.ERP5Type.Calendar', 'calendar': 'Products.ERP5Type.Calendar',
'collections': 'Products.ERP5Type.Collections', 'collections': 'Products.ERP5Type.Collections',
'six': 'Products.ERP5Type.Six', 'six': 'Products.ERP5Type.Six',
'pandas': 'Products.ERP5Type.Pandas',
} }
for alias, real in six.iteritems(MNAME_MAP): for alias, real in six.iteritems(MNAME_MAP):
assert '.' not in alias, alias # TODO: support this assert '.' not in alias, alias # TODO: support this
...@@ -478,23 +479,20 @@ def restrictedMethod(s,name): ...@@ -478,23 +479,20 @@ def restrictedMethod(s,name):
raise Unauthorized(name) raise Unauthorized(name)
return dummyMethod return dummyMethod
try: try:
import pandas as pd import pandas as pd
except ImportError: except ImportError:
pass pass
else: else:
allow_module('pandas')
allow_type(pd.Series)
allow_type(pd.Timestamp) allow_type(pd.Timestamp)
allow_type(pd.DatetimeIndex) allow_type(pd.DatetimeIndex)
# XXX: pd.DataFrame has its own security thus disable
# until we can fully integrate it
#allow_type(pd.DataFrame)
allow_type(pd.MultiIndex) allow_type(pd.MultiIndex)
allow_type(pd.indexes.range.RangeIndex) allow_type(pd.indexes.range.RangeIndex)
allow_type(pd.indexes.numeric.Int64Index) allow_type(pd.indexes.numeric.Int64Index)
allow_type(pd.core.groupby.DataFrameGroupBy) allow_type(pd.core.groupby.DataFrameGroupBy)
allow_type(pd.core.groupby.SeriesGroupBy) allow_type(pd.core.groupby.SeriesGroupBy)
allow_class(pd.DataFrame) allow_class(pd.DataFrame)
# Note: These black_list methods are for pandas 0.19.2 # Note: These black_list methods are for pandas 0.19.2
...@@ -503,10 +501,10 @@ else: ...@@ -503,10 +501,10 @@ else:
ContainerAssertions[pd.Series] = _check_access_wrapper( ContainerAssertions[pd.Series] = _check_access_wrapper(
pd.Series, dict.fromkeys(series_black_list, restrictedMethod)) pd.Series, dict.fromkeys(series_black_list, restrictedMethod))
pandas_black_list = ('read_csv', 'read_json', 'read_pickle', 'read_hdf', pandas_black_list = ('read_pickle', 'read_hdf',
'read_fwf', 'read_excel', 'read_html', 'read_msgpack', 'read_excel', 'read_html', 'read_msgpack',
'read_gbq', 'read_sas', 'read_stata') 'read_gbq', 'read_sas', 'read_stata')
ModuleSecurityInfo('pandas').declarePrivate(*pandas_black_list) ModuleSecurityInfo(MNAME_MAP['pandas']).declarePrivate(*pandas_black_list)
dataframe_black_list = ('to_csv', 'to_json', 'to_pickle', 'to_hdf', dataframe_black_list = ('to_csv', 'to_json', 'to_pickle', 'to_hdf',
'to_excel', 'to_html', 'to_sql', 'to_msgpack', 'to_excel', 'to_html', 'to_sql', 'to_msgpack',
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment