11from typing import Union , List , Sequence
2- from itertools import chain
32
43import numpy as np
54import pandas as pd
65import scipy .sparse as sp
76
87
# Data containers accepted by the helper functions in this module:
# scipy sparse matrices, pandas data frames, numpy arrays and plain lists.
modALinput = Union[sp.csr_matrix, pd.DataFrame, np.ndarray, list]
109
1110
def data_vstack(blocks: Sequence[modALinput]) -> modALinput:
    """
    Stack vertically sparse/dense arrays and pandas data frames.

    The result type is decided by the blocks: if any block is a scipy sparse
    matrix the result is sparse; otherwise the type of the first block
    (data frame, numpy array or list) determines the output type.

    Args:
        blocks: Sequence of modALinput objects.

    Returns:
        New sequence of vertically stacked elements.

    Raises:
        TypeError: If the type of the first block is not supported.
    """
    if any(sp.issparse(b) for b in blocks):
        return sp.vstack(blocks)
    elif isinstance(blocks[0], pd.DataFrame):
        # DataFrame.append was removed in pandas 2.0; pd.concat stacks all
        # blocks row-wise and keeps each block's original index labels.
        return pd.concat(blocks)
    elif isinstance(blocks[0], np.ndarray):
        return np.concatenate(blocks)
    elif isinstance(blocks[0], list):
        return np.concatenate(blocks).tolist()

    raise TypeError('%s datatype is not supported' % type(blocks[0]))
3531
3632
def data_hstack(blocks: Sequence[modALinput]) -> modALinput:
    """
    Stack horizontally sparse/dense arrays and pandas data frames.

    If any block is a scipy sparse matrix the result is sparse; otherwise
    the type of the first block (data frame, numpy array or list) determines
    the output type.

    Args:
        blocks: Sequence of modALinput objects.

    Returns:
        New sequence of horizontally stacked elements.

    Raises:
        TypeError: If the type of the first block is not supported.
    """
    if any(sp.issparse(b) for b in blocks):
        return sp.hstack(blocks)
    elif isinstance(blocks[0], pd.DataFrame):
        # The result must be returned; without it the function yields None.
        return pd.concat(blocks, axis=1)
    elif isinstance(blocks[0], np.ndarray):
        return np.hstack(blocks)
    elif isinstance(blocks[0], list):
        return np.hstack(blocks).tolist()

    # The exception has to be raised, not merely constructed.
    raise TypeError('%s datatype is not supported' % type(blocks[0]))
53+
54+
def add_row(X: modALinput, row: modALinput) -> modALinput:
    """
    Append ``row`` below ``X``, i.e. return X' =

    [X

    row]

    Args:
        X: The data set to extend.
        row: The row to stack underneath X.

    Returns:
        A numpy array if X is a numpy array, a list if X is a list; for any
        other type of X, whatever data_vstack([X, row]) returns.
    """
    if isinstance(X, np.ndarray):
        return np.vstack((X, row))
    elif isinstance(X, list):
        return np.vstack((X, row)).tolist()

    # data_vstack readily supports stacking of matrix as first argument
    # and row as second for the other data types
    return data_vstack([X, row])
5571
5672
def retrieve_rows(X: modALinput,
                  I: Union[int, List[int], np.ndarray]
                  ) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame, list]:
    """
    Returns the rows I from the data set X

    For a single index, the result is as follows:
    * 1xM matrix in case of scipy sparse NxM matrix X
    * pandas series in case of a pandas data frame
    * row in case of list or numpy format

    Args:
        X: The data set to take rows from.
        I: A single row position, a list of positions or a numpy index array.

    Returns:
        The selected row(s), in the same container type as X.

    Raises:
        TypeError: If the type of X is not supported.
    """
    if sp.issparse(X):
        # Out of the sparse matrix formats (sp.csc_matrix, sp.csr_matrix, sp.bsr_matrix,
        # sp.lil_matrix, sp.dok_matrix, sp.coo_matrix, sp.dia_matrix), only sp.bsr_matrix, sp.coo_matrix
        # and sp.dia_matrix don't support indexing and need to be converted to a sparse format
        # that does support indexing. It seems conversion to CSR is currently most efficient.
        try:
            return X[I]
        except Exception:
            # The exception raised by non-indexable formats is not uniform,
            # so catch broadly (but not BaseException) and retry through CSR,
            # converting the result back to the original format.
            sp_format = X.format
            return X.tocsr()[I].asformat(sp_format)
    elif isinstance(X, pd.DataFrame):
        return X.iloc[I]
    elif isinstance(X, np.ndarray):
        return X[I]
    elif isinstance(X, list):
        return np.array(X)[I].tolist()

    raise TypeError('%s datatype is not supported' % type(X))
66103
def drop_rows(X: modALinput,
              I: Union[int, List[int], np.ndarray]
              ) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame, list]:
    """
    Returns X without the row(s) at index/indices I

    Indices are row positions for every supported container, matching
    retrieve_rows.

    Args:
        X: The data set to remove rows from.
        I: A single row position, a list of positions or a numpy index array.

    Returns:
        A copy of X without the given rows, in the same container type as X.

    Raises:
        TypeError: If the type of X is not supported.
    """
    if sp.issparse(X):
        mask = np.ones(X.shape[0], dtype=bool)
        mask[I] = False
        return retrieve_rows(X, mask)
    elif isinstance(X, pd.DataFrame):
        # Drop by position, not by index label: a label-based drop removes the
        # wrong rows whenever the frame's index is not the default 0..N-1 range
        # (for example after stacking frames, which repeats labels).
        mask = np.ones(X.shape[0], dtype=bool)
        mask[I] = False
        return X.iloc[mask]
    elif isinstance(X, np.ndarray):
        return np.delete(X, I, axis=0)
    elif isinstance(X, list):
        return np.delete(X, I, axis=0).tolist()

    raise TypeError('%s datatype is not supported' % type(X))
73122
def enumerate_data(X: modALinput):
    """
    Iterate over the rows of X, for use as

    for i, x in enumerate_data(X):

    Depending on the data type of X, x is:

    * A 1xM matrix in case of scipy sparse NxM matrix X
    * pandas series in case of a pandas data frame X
    * row in case of list or numpy format

    For sparse matrices, numpy arrays and lists, i is the row position; for
    a pandas data frame, i is the row's index label as produced by iterrows().

    Raises:
        TypeError: If the type of X is not supported.
    """
    if sp.issparse(X):
        # Convert to CSR first so that every sparse format can be iterated row by row.
        return enumerate(X.tocsr())
    elif isinstance(X, pd.DataFrame):
        return X.iterrows()
    elif isinstance(X, (np.ndarray, list)):
        # numpy arrays and lists can readily be enumerated
        return enumerate(X)

    raise TypeError('%s datatype is not supported' % type(X))
142+
143+
def data_shape(X: modALinput):
    """
    Returns the shape of the data set X

    Args:
        X: The data set whose shape is requested.

    Returns:
        The shape tuple of X; a list is converted to a numpy array first.

    Raises:
        TypeError: If the type of X is not supported.
    """
    if sp.issparse(X) or isinstance(X, (pd.DataFrame, np.ndarray)):
        # scipy.sparse, pandas and numpy all support .shape
        return X.shape
    elif isinstance(X, list):
        return np.array(X).shape

    raise TypeError('%s datatype is not supported' % type(X))
0 commit comments