In [1]:
import numpy as np
import pandas as pd

numpy¶

In [2]:
np.__version__
Out[2]:
'1.23.5'

numpy中的数据存储是array, array要求数据类型是一样的,这和python中的list不同

In [3]:
np.array([1,1.0,'2']) # 强制转换成同一种类型
Out[3]:
array(['1', '1.0', '2'], dtype='<U32')
In [4]:
np.array([1,1.0,'2'],dtype=int)
Out[4]:
array([1, 1, 2])

generating Array¶

In [5]:
np.ones((3,5))
Out[5]:
array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])
In [6]:
np.full((3,5),1.231)
Out[6]:
array([[1.231, 1.231, 1.231, 1.231, 1.231],
       [1.231, 1.231, 1.231, 1.231, 1.231],
       [1.231, 1.231, 1.231, 1.231, 1.231]])
In [7]:
np.arange(0, 20)
Out[7]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])
In [8]:
np.random.normal(0, 1, (3,3))
Out[8]:
array([[ 1.27274173, -2.34781348, -0.31959194],
       [ 0.87566819,  0.09926286, -1.11582647],
       [ 1.5545384 ,  0.17286124,  1.50142145]])
In [9]:
np.eye(3)
Out[9]:
array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])
In [14]:
np.tile?
Signature: np.tile(A, reps)
Docstring:
Construct an array by repeating A the number of times given by reps.

If `reps` has length ``d``, the result will have dimension of
``max(d, A.ndim)``.

If ``A.ndim < d``, `A` is promoted to be d-dimensional by prepending new
axes. So a shape (3,) array is promoted to (1, 3) for 2-D replication,
or shape (1, 1, 3) for 3-D replication. If this is not the desired
behavior, promote `A` to d-dimensions manually before calling this
function.

If ``A.ndim > d``, `reps` is promoted to `A`.ndim by pre-pending 1's to it.
Thus for an `A` of shape (2, 3, 4, 5), a `reps` of (2, 2) is treated as
(1, 1, 2, 2).

Note : Although tile may be used for broadcasting, it is strongly
recommended to use numpy's broadcasting operations and functions.

Parameters
----------
A : array_like
    The input array.
reps : array_like
    The number of repetitions of `A` along each axis.

Returns
-------
c : ndarray
    The tiled output array.

See Also
--------
repeat : Repeat elements of an array.
broadcast_to : Broadcast an array to a new shape

Examples
--------
>>> a = np.array([0, 1, 2])
>>> np.tile(a, 2)
array([0, 1, 2, 0, 1, 2])
>>> np.tile(a, (2, 2))
array([[0, 1, 2, 0, 1, 2],
       [0, 1, 2, 0, 1, 2]])
>>> np.tile(a, (2, 1, 2))
array([[[0, 1, 2, 0, 1, 2]],
       [[0, 1, 2, 0, 1, 2]]])

>>> b = np.array([[1, 2], [3, 4]])
>>> np.tile(b, 2)
array([[1, 2, 1, 2],
       [3, 4, 3, 4]])
>>> np.tile(b, (2, 1))
array([[1, 2],
       [3, 4],
       [1, 2],
       [3, 4]])

>>> c = np.array([1,2,3,4])
>>> np.tile(c,(4,1))
array([[1, 2, 3, 4],
       [1, 2, 3, 4],
       [1, 2, 3, 4],
       [1, 2, 3, 4]])
File:      ~/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/shape_base.py
Type:      function
In [10]:
np.tile(10,3)
Out[10]:
array([10, 10, 10])
In [11]:
np.tile([1,2,3],3)
Out[11]:
array([1, 2, 3, 1, 2, 3, 1, 2, 3])
In [15]:
np.repeat?
Signature: np.repeat(a, repeats, axis=None)
Docstring:
Repeat elements of an array.

Parameters
----------
a : array_like
    Input array.
repeats : int or array of ints
    The number of repetitions for each element.  `repeats` is broadcasted
    to fit the shape of the given axis.
axis : int, optional
    The axis along which to repeat values.  By default, use the
    flattened input array, and return a flat output array.

Returns
-------
repeated_array : ndarray
    Output array which has the same shape as `a`, except along
    the given axis.

See Also
--------
tile : Tile an array.
unique : Find the unique elements of an array.

Examples
--------
>>> np.repeat(3, 4)
array([3, 3, 3, 3])
>>> x = np.array([[1,2],[3,4]])
>>> np.repeat(x, 2)
array([1, 1, 2, 2, 3, 3, 4, 4])
>>> np.repeat(x, 3, axis=1)
array([[1, 1, 1, 2, 2, 2],
       [3, 3, 3, 4, 4, 4]])
>>> np.repeat(x, [1, 2], axis=0)
array([[1, 2],
       [3, 4],
       [3, 4]])
File:      ~/opt/anaconda3/lib/python3.9/site-packages/numpy/core/fromnumeric.py
Type:      function
In [16]:
np.asarray([1,2,3]).repeat(3)
Out[16]:
array([1, 1, 1, 2, 2, 2, 3, 3, 3])

Array indexing and slicing¶

In [17]:
a = np.random.normal(0, 1, (3,3))
a
Out[17]:
array([[-0.49915155,  0.8337671 ,  0.40341252],
       [-1.36387181, -0.18727799, -0.35112724],
       [ 0.77928131, -0.36764775,  0.59875779]])
In [18]:
a[0,1]
Out[18]:
0.8337671001714373
In [19]:
a[0:2]
Out[19]:
array([[-0.49915155,  0.8337671 ,  0.40341252],
       [-1.36387181, -0.18727799, -0.35112724]])
In [20]:
a[0:2,1]
Out[20]:
array([ 0.8337671 , -0.18727799])
In [21]:
a[0,1] = 1000
In [22]:
a
Out[22]:
array([[-4.99151549e-01,  1.00000000e+03,  4.03412521e-01],
       [-1.36387181e+00, -1.87277987e-01, -3.51127242e-01],
       [ 7.79281310e-01, -3.67647747e-01,  5.98757794e-01]])
In [23]:
a[0:3:2]
Out[23]:
array([[-4.99151549e-01,  1.00000000e+03,  4.03412521e-01],
       [ 7.79281310e-01, -3.67647747e-01,  5.98757794e-01]])

Array concatenation¶

In [24]:
x = np.array([1, 2, 3])
y = np.array([3, 2, 1])
z = [21,21,21]
np.concatenate([x,y,z])
Out[24]:
array([ 1,  2,  3,  3,  2,  1, 21, 21, 21])
In [25]:
a1 = np.ones((3,3))
a2 = np.random.normal(0, 1, (3,3))
In [26]:
a1
Out[26]:
array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])
In [27]:
a2
Out[27]:
array([[-2.0217118 ,  1.11949095, -1.25693857],
       [-0.51433944,  0.9210426 ,  0.31195337],
       [ 1.53190446,  2.61674289,  0.58387914]])
In [28]:
np.concatenate([a1,a2])
Out[28]:
array([[ 1.        ,  1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        ],
       [-2.0217118 ,  1.11949095, -1.25693857],
       [-0.51433944,  0.9210426 ,  0.31195337],
       [ 1.53190446,  2.61674289,  0.58387914]])
In [29]:
np.concatenate([a1,a2], axis=1)
Out[29]:
array([[ 1.        ,  1.        ,  1.        , -2.0217118 ,  1.11949095,
        -1.25693857],
       [ 1.        ,  1.        ,  1.        , -0.51433944,  0.9210426 ,
         0.31195337],
       [ 1.        ,  1.        ,  1.        ,  1.53190446,  2.61674289,
         0.58387914]])

Array reshaping¶

In [30]:
a1 = np.array(range(10))
a1[0:3]
Out[30]:
array([0, 1, 2])
In [31]:
a1
Out[31]:
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
In [32]:
a1.shape
Out[32]:
(10,)
In [33]:
a1 = a1.reshape((10,1))
In [34]:
a1.shape
Out[34]:
(10, 1)
In [35]:
a1
Out[35]:
array([[0],
       [1],
       [2],
       [3],
       [4],
       [5],
       [6],
       [7],
       [8],
       [9]])
In [36]:
a1 = a1.reshape((1,10))
In [37]:
a1.shape
Out[37]:
(1, 10)
In [38]:
a1
Out[38]:
array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])
In [39]:
a1.reshape((2,6))
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[39], line 1
----> 1 a1.reshape((2,6))

ValueError: cannot reshape array of size 10 into shape (2,6)
In [40]:
a1 = a1.reshape((2,5))
In [41]:
a1
Out[41]:
array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])
In [42]:
a2 = np.array(range(10,20)).reshape((2,5))
In [43]:
a2
Out[43]:
array([[10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19]])
In [44]:
a1
Out[44]:
array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])
In [45]:
np.vstack([a1,a2])
Out[45]:
array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19]])
In [46]:
np.hstack([a1,a2])
Out[46]:
array([[ 0,  1,  2,  3,  4, 10, 11, 12, 13, 14],
       [ 5,  6,  7,  8,  9, 15, 16, 17, 18, 19]])

Vectorized manipulations¶

做numpy和pandas操作时的一个原则:能用向量操作尽量用向量操作,速度最快。尽量避免写元素的循环

In [47]:
a1 + 1000
Out[47]:
array([[1000, 1001, 1002, 1003, 1004],
       [1005, 1006, 1007, 1008, 1009]])
In [48]:
a1 * a2
Out[48]:
array([[  0,  11,  24,  39,  56],
       [ 75,  96, 119, 144, 171]])
In [49]:
list1 = [1,2,3] # some manipulations is not possible with raw python
list2 = [4,5,6]
In [50]:
list1 * list2
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[50], line 1
----> 1 list1 * list2

TypeError: can't multiply sequence by non-int of type 'list'
In [51]:
list1 + list2
Out[51]:
[1, 2, 3, 4, 5, 6]
In [52]:
x = np.array([[1, 2, 3], [4, 5, 6]])
x.ravel()
Out[52]:
array([1, 2, 3, 4, 5, 6])
In [53]:
x.ravel(order='F')
Out[53]:
array([1, 4, 2, 5, 3, 6])

Calculations with numpy¶

In [54]:
a1
Out[54]:
array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])
In [55]:
a1.max()
Out[55]:
9

axis: 0,从上往下(跨行)。1,从左往右(跨列)

也适用于pandas

In [56]:
a1.max(axis=1)
Out[56]:
array([4, 9])
In [57]:
a1.max(axis=0)
Out[57]:
array([5, 6, 7, 8, 9])
In [58]:
a1.sum()
Out[58]:
45
In [59]:
a1.sum(axis=0)
Out[59]:
array([ 5,  7,  9, 11, 13])
In [60]:
a1.sum(axis=1)
Out[60]:
array([10, 35])
In [61]:
a1.mean()
Out[61]:
4.5
In [62]:
a1
Out[62]:
array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])
In [63]:
a1.mean(axis=1)
Out[63]:
array([2., 7.])
In [64]:
a1.mean(axis=0)
Out[64]:
array([2.5, 3.5, 4.5, 5.5, 6.5])

Masked arrays¶

Mask 把 array 的一部分“挡住”,来阻止其他的函数对其进行操作。这在有的时候挺有用

In [65]:
import numpy.ma as ma
In [66]:
x = np.array([1,2,3,-1,4])
In [67]:
mx = ma.masked_array(x, mask=[0,0,0,1,0])
In [68]:
mx
Out[68]:
masked_array(data=[1, 2, 3, --, 4],
             mask=[False, False, False,  True, False],
       fill_value=999999)
In [69]:
mx.mean()
Out[69]:
2.5
In [70]:
# -999 as an invalid value, as sometimes seen in some databases
a = np.array([0,1,2,-999])
ma.masked_values(a, -999)
Out[70]:
masked_array(data=[0, 1, 2, --],
             mask=[False, False, False,  True],
       fill_value=-999)

pandas¶

  • 2008年由AQR工作的 Wes McKinney创始
  • 受到R中data.frame的影响很大
  • 其目的是为python提供一个便于操作金融数据的框架
  • 名称来源:panel data analysis
  • 已广泛应用于各类表格类(tabular)的数据操作

Create a data frame¶

In [71]:
df = pd.DataFrame({'name':[chr(i) for i in range(ord('a'),ord('z')+1)],
                   'value':range(26)}) # string.ascii_lowercase provides the alphabet too.
df
Out[71]:
name value
0 a 0
1 b 1
2 c 2
3 d 3
4 e 4
5 f 5
6 g 6
7 h 7
8 i 8
9 j 9
10 k 10
11 l 11
12 m 12
13 n 13
14 o 14
15 p 15
16 q 16
17 r 17
18 s 18
19 t 19
20 u 20
21 v 21
22 w 22
23 x 23
24 y 24
25 z 25
In [72]:
data = [['tom', 10], ['nick', 15], ['juli', 14]]
df = pd.DataFrame(data, columns = ['Name', 'Age'], index=['a','b','c'])
df
Out[72]:
Name Age
a tom 10
b nick 15
c juli 14

Quick production of dataframe examples with pandas._testing¶

In [73]:
import pandas._testing as tm
In [74]:
dir(tm)
Out[74]:
['ALL_INT_EA_DTYPES',
 'ALL_INT_NUMPY_DTYPES',
 'ALL_NUMPY_DTYPES',
 'ALL_REAL_NUMPY_DTYPES',
 'BOOL_DTYPES',
 'BYTES_DTYPES',
 'BaseMaskedArray',
 'COMPLEX_DTYPES',
 'Callable',
 'Categorical',
 'CategoricalIndex',
 'ContextManager',
 'Counter',
 'DATETIME64_DTYPES',
 'DataFrame',
 'DatetimeIndex',
 'Decimal',
 'Dtype',
 'EMPTY_STRING_PATTERN',
 'ENDIAN',
 'ExtensionArray',
 'FLOAT_EA_DTYPES',
 'FLOAT_NUMPY_DTYPES',
 'Float64Index',
 'Index',
 'Int64Index',
 'IntervalIndex',
 'Iterable',
 'MultiIndex',
 'NARROW_NP_DTYPES',
 'NDArrayBackedExtensionArray',
 'NP_NAT_OBJECTS',
 'NULL_OBJECTS',
 'NumericIndex',
 'OBJECT_DTYPES',
 'PandasArray',
 'RNGContext',
 'RangeIndex',
 'SIGNED_INT_EA_DTYPES',
 'SIGNED_INT_NUMPY_DTYPES',
 'STRING_DTYPES',
 'Series',
 'SubclassedCategorical',
 'SubclassedDataFrame',
 'SubclassedSeries',
 'TIMEDELTA64_DTYPES',
 'TYPE_CHECKING',
 'UInt64Index',
 'UNSIGNED_INT_EA_DTYPES',
 'UNSIGNED_INT_NUMPY_DTYPES',
 '_K',
 '_N',
 '__all__',
 '__annotations__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_create_missing_idx',
 '_io',
 '_make_skipna_wrapper',
 '_random',
 '_testing_mode_warnings',
 '_warnings',
 'all_timeseries_index_generator',
 'annotations',
 'assert_almost_equal',
 'assert_attr_equal',
 'assert_categorical_equal',
 'assert_class_equal',
 'assert_contains_all',
 'assert_copy',
 'assert_datetime_array_equal',
 'assert_dict_equal',
 'assert_equal',
 'assert_extension_array_equal',
 'assert_frame_equal',
 'assert_index_equal',
 'assert_indexing_slices_equivalent',
 'assert_interval_array_equal',
 'assert_is_sorted',
 'assert_is_valid_plot_return_object',
 'assert_metadata_equivalent',
 'assert_numpy_array_equal',
 'assert_period_array_equal',
 'assert_produces_warning',
 'assert_series_equal',
 'assert_sp_array_equal',
 'assert_timedelta_array_equal',
 'asserters',
 'at',
 'bdate_range',
 'box_expected',
 'byteorder',
 'can_set_locale',
 'close',
 'collections',
 'compat',
 'contexts',
 'convert_rows_list_to_csv_str',
 'cython_table',
 'datetime',
 'decompress_file',
 'ensure_clean',
 'ensure_clean_dir',
 'ensure_safe_environment_variables',
 'equalContents',
 'external_error_raised',
 'extract_array',
 'getCols',
 'getMixedTypeDict',
 'getPeriodData',
 'getSeriesData',
 'getTimeSeriesData',
 'get_cython_table_params',
 'get_dtype',
 'get_locales',
 'get_obj',
 'get_op_from_name',
 'getitem',
 'iat',
 'iloc',
 'index_subclass_makers_generator',
 'is_float_dtype',
 'is_integer_dtype',
 'is_sequence',
 'is_unsigned_integer_dtype',
 'loc',
 'makeBoolIndex',
 'makeCategoricalIndex',
 'makeCustomDataframe',
 'makeCustomIndex',
 'makeDataFrame',
 'makeDateIndex',
 'makeFloatIndex',
 'makeFloatSeries',
 'makeIntIndex',
 'makeIntervalIndex',
 'makeMissingDataframe',
 'makeMixedDataFrame',
 'makeMultiIndex',
 'makeNumericIndex',
 'makeObjectSeries',
 'makePeriodFrame',
 'makePeriodIndex',
 'makePeriodSeries',
 'makeRangeIndex',
 'makeStringIndex',
 'makeStringSeries',
 'makeTimeDataFrame',
 'makeTimeSeries',
 'makeTimedeltaIndex',
 'makeUIntIndex',
 'make_rand_series',
 'maybe_produces_warning',
 'network',
 'np',
 'operator',
 'os',
 'pa_version_under1p01',
 'pandas_dtype',
 'pd',
 'raise_assert_detail',
 'randbool',
 'rands',
 'rands_array',
 're',
 'reset_display_options',
 'reset_testing_mode',
 'round_trip_localpath',
 'round_trip_pathlib',
 'round_trip_pickle',
 'set_locale',
 'set_testing_mode',
 'set_timezone',
 'setitem',
 'shares_memory',
 'string',
 'to_array',
 'use_numexpr',
 'warnings',
 'with_csv_dialect',
 'write_to_compressed']
In [75]:
tm.makeDataFrame()
Out[75]:
A B C D
VHunICcLPJ -0.795996 -0.351695 -1.385583 -0.753557
NESKybUEDH -1.269138 0.863860 -0.976737 0.766969
qMAMB3xDut 1.472684 0.478087 -0.239865 0.353478
JRlRCQUzuu -0.447649 -0.333359 0.119227 -0.246769
U5jtekM3k2 -0.132406 -0.737123 1.728766 -1.153852
PrBiVPdIaJ 0.382802 0.364979 1.206549 -2.258635
unVAHNdKYu -1.072163 0.740952 -0.389145 1.039403
MotvwNxPmd 0.066330 -1.973623 1.012141 -1.102447
89m1l3btYy 0.016252 0.555613 0.298718 0.980350
9HTnDXF40x -0.108969 -1.131245 -1.315161 0.365393
LRkcwE7Lmx 0.696143 -1.422110 1.337381 -1.690665
psnMfLek7U -0.553044 -1.167765 -1.129249 0.267534
HeABMPzPXF -0.115510 0.721029 -0.249622 1.093956
2xHL40vnf2 0.406484 0.816362 0.198217 -0.515139
KSCjLRTaae -0.521772 0.186359 -1.287775 0.931270
PZMnNoQpiJ 1.906866 0.014740 -0.441951 -0.433909
iJbHYwo277 0.999935 0.812229 0.436503 -0.571638
qc8tNYVRCg -0.015067 -0.199682 -1.991939 -1.358378
zK2FmNbFSj -0.098908 0.357793 -0.701900 0.500967
vooqmpNXRy 0.214110 -0.027231 -0.392369 -1.650658
Ghj8CJjGha -1.161240 0.359830 -0.860943 1.392138
ehhvv804ye 0.819537 1.287188 -0.581673 0.636301
9jtiXBZ1y7 -0.128448 0.599718 1.732266 0.492705
RDhWwjCJDA 0.216929 0.210923 0.136244 -0.541380
71Bkvd5XbY -1.188419 0.324790 1.285649 0.176251
kZPHgLtScF -0.667199 0.597654 0.400407 0.139640
jpEWrtGEYv -0.327816 3.228324 -1.162103 1.216709
kE3hNb4qjr -0.570806 -0.630258 0.648322 0.823451
XNQiSNBQGJ 0.466998 -0.074483 -0.559314 1.242169
k6gL5YU8dQ 0.325523 -0.585791 -0.758267 0.014827
In [76]:
tm.makeMixedDataFrame().info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   A       5 non-null      float64       
 1   B       5 non-null      float64       
 2   C       5 non-null      object        
 3   D       5 non-null      datetime64[ns]
dtypes: datetime64[ns](1), float64(2), object(1)
memory usage: 288.0+ bytes
In [77]:
df1 = tm.makeTimeDataFrame()
df1 
Out[77]:
A B C D
2000-01-03 -0.144370 0.201565 0.516506 1.144586
2000-01-04 -2.145394 0.692739 -0.207220 1.570583
2000-01-05 0.282223 -0.412550 -0.375129 1.004230
2000-01-06 -0.199454 -1.049554 -0.416481 0.601689
2000-01-07 0.892563 -0.422116 -0.497795 2.062563
2000-01-10 1.952667 -2.366110 -0.335015 0.092206
2000-01-11 1.182459 -2.054859 -0.543254 0.323785
2000-01-12 -0.367848 -0.032513 0.021224 0.768913
2000-01-13 0.023308 -0.701854 -2.019472 -0.251018
2000-01-14 1.410542 -1.622660 0.443647 -0.933541
2000-01-17 0.783971 -0.007375 0.296998 -0.363226
2000-01-18 -1.361430 0.153546 -0.619399 1.146770
2000-01-19 1.346466 -1.668749 -1.303989 -0.543123
2000-01-20 0.347398 0.848440 1.952129 -0.167775
2000-01-21 -1.510918 1.309377 -1.564935 -0.795824
2000-01-24 -0.307359 0.502352 -0.047250 0.499364
2000-01-25 0.987366 0.992062 -0.777740 -0.188208
2000-01-26 0.187069 0.468695 1.160078 1.104165
2000-01-27 -1.191208 0.324158 -0.277405 -0.871390
2000-01-28 -1.403417 0.168218 -0.297890 -0.398667
2000-01-31 -0.786557 -0.161181 0.714900 0.916390
2000-02-01 -0.541403 -0.881791 1.270019 -0.222928
2000-02-02 -1.331567 -0.279310 2.249028 -0.024690
2000-02-03 0.665224 -1.476585 1.902657 -0.254340
2000-02-04 2.076048 1.329543 -0.446659 0.007978
2000-02-07 0.936564 -0.062680 -1.205261 -1.963803
2000-02-08 -0.737788 0.516168 -0.707155 0.394130
2000-02-09 1.030261 1.145068 -1.908743 -0.645290
2000-02-10 0.404236 0.528930 0.481302 -0.483042
2000-02-11 0.498774 -0.133681 -0.291691 1.570372
In [78]:
df2 = tm.makePeriodFrame()
df2
Out[78]:
A B C D
2000-01-03 1.429051 -0.580283 0.990025 -0.257934
2000-01-04 -0.110579 1.306100 1.963598 0.894591
2000-01-05 0.350137 0.145053 -2.283949 -0.210955
2000-01-06 0.683855 0.319405 1.167300 0.572679
2000-01-07 -1.676209 -0.482623 0.261776 0.680530
2000-01-10 0.328323 -2.349310 0.426277 0.120216
2000-01-11 1.065787 0.320567 0.342133 0.311434
2000-01-12 -0.379241 0.143673 -1.389723 1.336481
2000-01-13 -1.042176 -0.138300 -0.909156 -1.352213
2000-01-14 -0.621671 1.315829 -0.465050 1.242555
2000-01-17 1.035397 0.310866 -0.353529 -1.049046
2000-01-18 1.115317 0.182585 -1.047423 -0.665544
2000-01-19 1.324550 -0.762196 1.100811 -0.287230
2000-01-20 0.824192 0.326103 0.232408 -0.035676
2000-01-21 0.073493 -0.343796 -0.876300 1.399121
2000-01-24 1.214491 1.377404 -0.186971 0.479992
2000-01-25 -0.558259 1.458651 -0.045784 -0.754432
2000-01-26 0.525250 -0.197330 -1.364182 -0.761588
2000-01-27 0.049187 -1.423574 1.432088 -1.856000
2000-01-28 0.342931 -1.934350 0.598586 -0.807426
2000-01-31 -0.103926 0.024246 -1.330241 -0.074468
2000-02-01 1.559401 1.679189 1.009991 -0.042107
2000-02-02 0.269948 1.031486 0.892775 -0.810168
2000-02-03 -0.560028 0.091427 1.024934 0.117938
2000-02-04 1.881646 0.610332 -0.231905 -0.371461
2000-02-07 -0.942971 -0.999706 -0.711167 2.733572
2000-02-08 0.618878 -0.563883 1.723752 0.194319
2000-02-09 -0.678689 0.367987 1.309123 0.827901
2000-02-10 -1.052600 1.226809 0.871817 -0.603279
2000-02-11 -0.998651 -0.635610 0.180419 0.739663

Explorations on DatetimeIndex, PeriodIndex¶

In [79]:
df1.index
Out[79]:
DatetimeIndex(['2000-01-03', '2000-01-04', '2000-01-05', '2000-01-06',
               '2000-01-07', '2000-01-10', '2000-01-11', '2000-01-12',
               '2000-01-13', '2000-01-14', '2000-01-17', '2000-01-18',
               '2000-01-19', '2000-01-20', '2000-01-21', '2000-01-24',
               '2000-01-25', '2000-01-26', '2000-01-27', '2000-01-28',
               '2000-01-31', '2000-02-01', '2000-02-02', '2000-02-03',
               '2000-02-04', '2000-02-07', '2000-02-08', '2000-02-09',
               '2000-02-10', '2000-02-11'],
              dtype='datetime64[ns]', freq='B')
In [80]:
df1.index.day_of_week
Out[80]:
Int64Index([0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1,
            2, 3, 4, 0, 1, 2, 3, 4],
           dtype='int64')
In [81]:
df1.index.shift(periods=1) # 时间序列index的shift,会按照指定的period或freq平移。注意:如果没有时间序列,则直接平移,见df1['A']例
Out[81]:
DatetimeIndex(['2000-01-04', '2000-01-05', '2000-01-06', '2000-01-07',
               '2000-01-10', '2000-01-11', '2000-01-12', '2000-01-13',
               '2000-01-14', '2000-01-17', '2000-01-18', '2000-01-19',
               '2000-01-20', '2000-01-21', '2000-01-24', '2000-01-25',
               '2000-01-26', '2000-01-27', '2000-01-28', '2000-01-31',
               '2000-02-01', '2000-02-02', '2000-02-03', '2000-02-04',
               '2000-02-07', '2000-02-08', '2000-02-09', '2000-02-10',
               '2000-02-11', '2000-02-14'],
              dtype='datetime64[ns]', freq='B')
In [82]:
df1.index.shift(freq='D')
Out[82]:
DatetimeIndex(['2000-01-04', '2000-01-05', '2000-01-06', '2000-01-07',
               '2000-01-08', '2000-01-11', '2000-01-12', '2000-01-13',
               '2000-01-14', '2000-01-15', '2000-01-18', '2000-01-19',
               '2000-01-20', '2000-01-21', '2000-01-22', '2000-01-25',
               '2000-01-26', '2000-01-27', '2000-01-28', '2000-01-29',
               '2000-02-01', '2000-02-02', '2000-02-03', '2000-02-04',
               '2000-02-05', '2000-02-08', '2000-02-09', '2000-02-10',
               '2000-02-11', '2000-02-12'],
              dtype='datetime64[ns]', freq=None)
In [83]:
df1.index.shift(freq='2D')
Out[83]:
DatetimeIndex(['2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08',
               '2000-01-09', '2000-01-12', '2000-01-13', '2000-01-14',
               '2000-01-15', '2000-01-16', '2000-01-19', '2000-01-20',
               '2000-01-21', '2000-01-22', '2000-01-23', '2000-01-26',
               '2000-01-27', '2000-01-28', '2000-01-29', '2000-01-30',
               '2000-02-02', '2000-02-03', '2000-02-04', '2000-02-05',
               '2000-02-06', '2000-02-09', '2000-02-10', '2000-02-11',
               '2000-02-12', '2000-02-13'],
              dtype='datetime64[ns]', freq=None)
In [84]:
df1['A']
Out[84]:
2000-01-03   -0.144370
2000-01-04   -2.145394
2000-01-05    0.282223
2000-01-06   -0.199454
2000-01-07    0.892563
2000-01-10    1.952667
2000-01-11    1.182459
2000-01-12   -0.367848
2000-01-13    0.023308
2000-01-14    1.410542
2000-01-17    0.783971
2000-01-18   -1.361430
2000-01-19    1.346466
2000-01-20    0.347398
2000-01-21   -1.510918
2000-01-24   -0.307359
2000-01-25    0.987366
2000-01-26    0.187069
2000-01-27   -1.191208
2000-01-28   -1.403417
2000-01-31   -0.786557
2000-02-01   -0.541403
2000-02-02   -1.331567
2000-02-03    0.665224
2000-02-04    2.076048
2000-02-07    0.936564
2000-02-08   -0.737788
2000-02-09    1.030261
2000-02-10    0.404236
2000-02-11    0.498774
Freq: B, Name: A, dtype: float64
In [85]:
df1['A'].shift()
Out[85]:
2000-01-03         NaN
2000-01-04   -0.144370
2000-01-05   -2.145394
2000-01-06    0.282223
2000-01-07   -0.199454
2000-01-10    0.892563
2000-01-11    1.952667
2000-01-12    1.182459
2000-01-13   -0.367848
2000-01-14    0.023308
2000-01-17    1.410542
2000-01-18    0.783971
2000-01-19   -1.361430
2000-01-20    1.346466
2000-01-21    0.347398
2000-01-24   -1.510918
2000-01-25   -0.307359
2000-01-26    0.987366
2000-01-27    0.187069
2000-01-28   -1.191208
2000-01-31   -1.403417
2000-02-01   -0.786557
2000-02-02   -0.541403
2000-02-03   -1.331567
2000-02-04    0.665224
2000-02-07    2.076048
2000-02-08    0.936564
2000-02-09   -0.737788
2000-02-10    1.030261
2000-02-11    0.404236
Freq: B, Name: A, dtype: float64
In [86]:
a = df1.index.shift() - df1.index
In [87]:
a
Out[87]:
TimedeltaIndex(['1 days', '1 days', '1 days', '1 days', '3 days', '1 days',
                '1 days', '1 days', '1 days', '3 days', '1 days', '1 days',
                '1 days', '1 days', '3 days', '1 days', '1 days', '1 days',
                '1 days', '3 days', '1 days', '1 days', '1 days', '1 days',
                '3 days', '1 days', '1 days', '1 days', '1 days', '3 days'],
               dtype='timedelta64[ns]', freq=None)
In [88]:
df1.index.values # index中内部的保存数据类型。包装成index后有对应于index的其他操作
Out[88]:
array(['2000-01-03T00:00:00.000000000', '2000-01-04T00:00:00.000000000',
       '2000-01-05T00:00:00.000000000', '2000-01-06T00:00:00.000000000',
       '2000-01-07T00:00:00.000000000', '2000-01-10T00:00:00.000000000',
       '2000-01-11T00:00:00.000000000', '2000-01-12T00:00:00.000000000',
       '2000-01-13T00:00:00.000000000', '2000-01-14T00:00:00.000000000',
       '2000-01-17T00:00:00.000000000', '2000-01-18T00:00:00.000000000',
       '2000-01-19T00:00:00.000000000', '2000-01-20T00:00:00.000000000',
       '2000-01-21T00:00:00.000000000', '2000-01-24T00:00:00.000000000',
       '2000-01-25T00:00:00.000000000', '2000-01-26T00:00:00.000000000',
       '2000-01-27T00:00:00.000000000', '2000-01-28T00:00:00.000000000',
       '2000-01-31T00:00:00.000000000', '2000-02-01T00:00:00.000000000',
       '2000-02-02T00:00:00.000000000', '2000-02-03T00:00:00.000000000',
       '2000-02-04T00:00:00.000000000', '2000-02-07T00:00:00.000000000',
       '2000-02-08T00:00:00.000000000', '2000-02-09T00:00:00.000000000',
       '2000-02-10T00:00:00.000000000', '2000-02-11T00:00:00.000000000'],
      dtype='datetime64[ns]')
In [89]:
df1.index.values.to_period()
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[89], line 1
----> 1 df1.index.values.to_period()

AttributeError: 'numpy.ndarray' object has no attribute 'to_period'
In [90]:
df1.index.to_period()
Out[90]:
PeriodIndex(['2000-01-03', '2000-01-04', '2000-01-05', '2000-01-06',
             '2000-01-07', '2000-01-10', '2000-01-11', '2000-01-12',
             '2000-01-13', '2000-01-14', '2000-01-17', '2000-01-18',
             '2000-01-19', '2000-01-20', '2000-01-21', '2000-01-24',
             '2000-01-25', '2000-01-26', '2000-01-27', '2000-01-28',
             '2000-01-31', '2000-02-01', '2000-02-02', '2000-02-03',
             '2000-02-04', '2000-02-07', '2000-02-08', '2000-02-09',
             '2000-02-10', '2000-02-11'],
            dtype='period[B]')
In [91]:
df2.index
Out[91]:
PeriodIndex(['2000-01-03', '2000-01-04', '2000-01-05', '2000-01-06',
             '2000-01-07', '2000-01-10', '2000-01-11', '2000-01-12',
             '2000-01-13', '2000-01-14', '2000-01-17', '2000-01-18',
             '2000-01-19', '2000-01-20', '2000-01-21', '2000-01-24',
             '2000-01-25', '2000-01-26', '2000-01-27', '2000-01-28',
             '2000-01-31', '2000-02-01', '2000-02-02', '2000-02-03',
             '2000-02-04', '2000-02-07', '2000-02-08', '2000-02-09',
             '2000-02-10', '2000-02-11'],
            dtype='period[B]')
In [92]:
df2.index.values
Out[92]:
array([Period('2000-01-03', 'B'), Period('2000-01-04', 'B'),
       Period('2000-01-05', 'B'), Period('2000-01-06', 'B'),
       Period('2000-01-07', 'B'), Period('2000-01-10', 'B'),
       Period('2000-01-11', 'B'), Period('2000-01-12', 'B'),
       Period('2000-01-13', 'B'), Period('2000-01-14', 'B'),
       Period('2000-01-17', 'B'), Period('2000-01-18', 'B'),
       Period('2000-01-19', 'B'), Period('2000-01-20', 'B'),
       Period('2000-01-21', 'B'), Period('2000-01-24', 'B'),
       Period('2000-01-25', 'B'), Period('2000-01-26', 'B'),
       Period('2000-01-27', 'B'), Period('2000-01-28', 'B'),
       Period('2000-01-31', 'B'), Period('2000-02-01', 'B'),
       Period('2000-02-02', 'B'), Period('2000-02-03', 'B'),
       Period('2000-02-04', 'B'), Period('2000-02-07', 'B'),
       Period('2000-02-08', 'B'), Period('2000-02-09', 'B'),
       Period('2000-02-10', 'B'), Period('2000-02-11', 'B')], dtype=object)
In [93]:
df2.index.week
Out[93]:
Int64Index([1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5,
            5, 5, 5, 6, 6, 6, 6, 6],
           dtype='int64')
In [94]:
df2.index.day_of_week
Out[94]:
Int64Index([0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1,
            2, 3, 4, 0, 1, 2, 3, 4],
           dtype='int64')
In [95]:
df2.index.weekofyear
Out[95]:
Int64Index([1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5,
            5, 5, 5, 6, 6, 6, 6, 6],
           dtype='int64')
In [96]:
df2.index.to_timestamp()
Out[96]:
DatetimeIndex(['2000-01-03', '2000-01-04', '2000-01-05', '2000-01-06',
               '2000-01-07', '2000-01-10', '2000-01-11', '2000-01-12',
               '2000-01-13', '2000-01-14', '2000-01-17', '2000-01-18',
               '2000-01-19', '2000-01-20', '2000-01-21', '2000-01-24',
               '2000-01-25', '2000-01-26', '2000-01-27', '2000-01-28',
               '2000-01-31', '2000-02-01', '2000-02-02', '2000-02-03',
               '2000-02-04', '2000-02-07', '2000-02-08', '2000-02-09',
               '2000-02-10', '2000-02-11'],
              dtype='datetime64[ns]', freq='B')
In [97]:
df2.index.shift() - df2.index
Out[97]:
Index([<BusinessDay>, <BusinessDay>, <BusinessDay>, <BusinessDay>,
       <BusinessDay>, <BusinessDay>, <BusinessDay>, <BusinessDay>,
       <BusinessDay>, <BusinessDay>, <BusinessDay>, <BusinessDay>,
       <BusinessDay>, <BusinessDay>, <BusinessDay>, <BusinessDay>,
       <BusinessDay>, <BusinessDay>, <BusinessDay>, <BusinessDay>,
       <BusinessDay>, <BusinessDay>, <BusinessDay>, <BusinessDay>,
       <BusinessDay>, <BusinessDay>, <BusinessDay>, <BusinessDay>,
       <BusinessDay>, <BusinessDay>],
      dtype='object')
In [98]:
df2.index[1:].append(pd.Index([np.nan])) # 因为shift直接平移日期。若需直接平移数值,可用这个替代。
Out[98]:
PeriodIndex(['2000-01-04', '2000-01-05', '2000-01-06', '2000-01-07',
             '2000-01-10', '2000-01-11', '2000-01-12', '2000-01-13',
             '2000-01-14', '2000-01-17', '2000-01-18', '2000-01-19',
             '2000-01-20', '2000-01-21', '2000-01-24', '2000-01-25',
             '2000-01-26', '2000-01-27', '2000-01-28', '2000-01-31',
             '2000-02-01', '2000-02-02', '2000-02-03', '2000-02-04',
             '2000-02-07', '2000-02-08', '2000-02-09', '2000-02-10',
             '2000-02-11',        'NaT'],
            dtype='period[B]')
In [99]:
df2.index[1:].append(pd.Index([np.nan])).asfreq('D') - df2.index.asfreq('D')
Out[99]:
Index([     <Day>,      <Day>,      <Day>,      <Day>, <3 * Days>,      <Day>,
            <Day>,      <Day>,      <Day>, <3 * Days>,      <Day>,      <Day>,
            <Day>,      <Day>, <3 * Days>,      <Day>,      <Day>,      <Day>,
            <Day>, <3 * Days>,      <Day>,      <Day>,      <Day>,      <Day>,
       <3 * Days>,      <Day>,      <Day>,      <Day>,      <Day>,        NaT],
      dtype='object')

read data¶

In [100]:
df = pd.read_pickle('../../../data/stk_df_2015_2021.pkl')
In [101]:
df
Out[101]:
secID secShortName exchangeCD tradeDate preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
0 000001.XSHE 平安银行 XSHE 2015-01-05 1293.044 1307.737 4966040 4.565388e+09 92478.0 0.0291 1.575841e+11 1.830268e+11
1 000001.XSHE 平安银行 XSHE 2015-01-06 1307.737 1288.146 3761152 3.453446e+09 80325.0 0.0220 1.552233e+11 1.802848e+11
2 000001.XSHE 平安银行 XSHE 2015-01-07 1288.146 1263.656 2951601 2.634796e+09 72697.0 0.0173 1.522723e+11 1.768574e+11
3 000001.XSHE 平安银行 XSHE 2015-01-08 1263.656 1221.208 2443951 2.128003e+09 68734.0 0.0143 1.471572e+11 1.709164e+11
4 000001.XSHE 平安银行 XSHE 2015-01-09 1221.208 1231.004 4355039 3.835378e+09 99882.0 0.0255 1.483376e+11 1.722874e+11
... ... ... ... ... ... ... ... ... ... ... ... ...
5838745 900957.XSHG 凌云B股 XSHG 2021-12-27 0.625 0.638 488511 3.081540e+05 226.0 0.0027 1.164720e+08 2.209170e+08
5838746 900957.XSHG 凌云B股 XSHG 2021-12-28 0.638 0.637 177702 1.118810e+05 64.0 0.0010 1.162880e+08 2.205680e+08
5838747 900957.XSHG 凌云B股 XSHG 2021-12-29 0.637 0.630 123550 7.733300e+04 58.0 0.0007 1.150000e+08 2.181250e+08
5838748 900957.XSHG 凌云B股 XSHG 2021-12-30 0.630 0.635 113600 7.130800e+04 41.0 0.0006 1.159200e+08 2.198700e+08
5838749 900957.XSHG 凌云B股 XSHG 2021-12-31 0.635 0.636 167800 1.059960e+05 74.0 0.0009 1.161040e+08 2.202190e+08

5838750 rows × 12 columns

In [102]:
display(df)
secID secShortName exchangeCD tradeDate preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
0 000001.XSHE 平安银行 XSHE 2015-01-05 1293.044 1307.737 4966040 4.565388e+09 92478.0 0.0291 1.575841e+11 1.830268e+11
1 000001.XSHE 平安银行 XSHE 2015-01-06 1307.737 1288.146 3761152 3.453446e+09 80325.0 0.0220 1.552233e+11 1.802848e+11
2 000001.XSHE 平安银行 XSHE 2015-01-07 1288.146 1263.656 2951601 2.634796e+09 72697.0 0.0173 1.522723e+11 1.768574e+11
3 000001.XSHE 平安银行 XSHE 2015-01-08 1263.656 1221.208 2443951 2.128003e+09 68734.0 0.0143 1.471572e+11 1.709164e+11
4 000001.XSHE 平安银行 XSHE 2015-01-09 1221.208 1231.004 4355039 3.835378e+09 99882.0 0.0255 1.483376e+11 1.722874e+11
... ... ... ... ... ... ... ... ... ... ... ... ...
5838745 900957.XSHG 凌云B股 XSHG 2021-12-27 0.625 0.638 488511 3.081540e+05 226.0 0.0027 1.164720e+08 2.209170e+08
5838746 900957.XSHG 凌云B股 XSHG 2021-12-28 0.638 0.637 177702 1.118810e+05 64.0 0.0010 1.162880e+08 2.205680e+08
5838747 900957.XSHG 凌云B股 XSHG 2021-12-29 0.637 0.630 123550 7.733300e+04 58.0 0.0007 1.150000e+08 2.181250e+08
5838748 900957.XSHG 凌云B股 XSHG 2021-12-30 0.630 0.635 113600 7.130800e+04 41.0 0.0006 1.159200e+08 2.198700e+08
5838749 900957.XSHG 凌云B股 XSHG 2021-12-31 0.635 0.636 167800 1.059960e+05 74.0 0.0009 1.161040e+08 2.202190e+08

5838750 rows × 12 columns

In [103]:
temp = pd.read_csv('../../../data/idx_df_csv.csv',dtype={'ticker':str})
temp.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1365 entries, 0 to 1364
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ticker         1365 non-null   object 
 1   secShortName   1365 non-null   object 
 2   tradeDate      1365 non-null   object 
 3   preCloseIndex  1365 non-null   float64
 4   openIndex      1365 non-null   float64
 5   highestIndex   1365 non-null   float64
 6   lowestIndex    1365 non-null   float64
 7   closeIndex     1365 non-null   float64
 8   turnoverVol    1365 non-null   int64  
dtypes: float64(5), int64(1), object(3)
memory usage: 96.1+ KB
In [104]:
temp
Out[104]:
ticker secShortName tradeDate preCloseIndex openIndex highestIndex lowestIndex closeIndex turnoverVol
0 000001 上证综指 2021-01-04 3473.0693 3474.6793 3511.6554 3457.2061 3502.9584 38079080000
1 000001 上证综指 2021-01-05 3502.9584 3492.1912 3528.6767 3484.7151 3528.6767 40799593400
2 000001 上证综指 2021-01-06 3528.6767 3530.9072 3556.8022 3513.1262 3550.8767 37023092600
3 000001 上证综指 2021-01-07 3550.8767 3552.9087 3576.2046 3526.6174 3576.2046 40534822600
4 000001 上证综指 2021-01-08 3576.2046 3577.6923 3588.0625 3544.8912 3570.1082 34555789600
... ... ... ... ... ... ... ... ... ...
1360 399106 深证综指 2022-02-15 2253.1310 2256.8670 2283.6760 2251.3000 2283.6300 37139160803
1361 399106 深证综指 2022-02-16 2283.6300 2293.0800 2304.2800 2285.6750 2296.9930 37530877510
1362 399106 深证综指 2022-02-17 2296.9930 2293.4480 2314.8270 2287.8960 2302.1620 42928561256
1363 399106 深证综指 2022-02-18 2302.1620 2287.2640 2311.8220 2284.1220 2311.7900 38837049683
1364 399106 深证综指 2022-02-21 2311.7900 2313.5060 2325.9670 2309.7340 2325.7960 44801214227

1365 rows × 9 columns

In [105]:
# 如果读取csv或其他类型的文件, 可能会遇到一些格式转换的问题
pd.read_csv('../../../data/idx_df_csv.csv')
Out[105]:
ticker secShortName tradeDate preCloseIndex openIndex highestIndex lowestIndex closeIndex turnoverVol
0 1 上证综指 2021-01-04 3473.0693 3474.6793 3511.6554 3457.2061 3502.9584 38079080000
1 1 上证综指 2021-01-05 3502.9584 3492.1912 3528.6767 3484.7151 3528.6767 40799593400
2 1 上证综指 2021-01-06 3528.6767 3530.9072 3556.8022 3513.1262 3550.8767 37023092600
3 1 上证综指 2021-01-07 3550.8767 3552.9087 3576.2046 3526.6174 3576.2046 40534822600
4 1 上证综指 2021-01-08 3576.2046 3577.6923 3588.0625 3544.8912 3570.1082 34555789600
... ... ... ... ... ... ... ... ... ...
1360 399106 深证综指 2022-02-15 2253.1310 2256.8670 2283.6760 2251.3000 2283.6300 37139160803
1361 399106 深证综指 2022-02-16 2283.6300 2293.0800 2304.2800 2285.6750 2296.9930 37530877510
1362 399106 深证综指 2022-02-17 2296.9930 2293.4480 2314.8270 2287.8960 2302.1620 42928561256
1363 399106 深证综指 2022-02-18 2302.1620 2287.2640 2311.8220 2284.1220 2311.7900 38837049683
1364 399106 深证综指 2022-02-21 2311.7900 2313.5060 2325.9670 2309.7340 2325.7960 44801214227

1365 rows × 9 columns

Quick summary¶

In [106]:
df.head(10)
Out[106]:
secID secShortName exchangeCD tradeDate preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
0 000001.XSHE 平安银行 XSHE 2015-01-05 1293.044 1307.737 4966040 4.565388e+09 92478.0 0.0291 1.575841e+11 1.830268e+11
1 000001.XSHE 平安银行 XSHE 2015-01-06 1307.737 1288.146 3761152 3.453446e+09 80325.0 0.0220 1.552233e+11 1.802848e+11
2 000001.XSHE 平安银行 XSHE 2015-01-07 1288.146 1263.656 2951601 2.634796e+09 72697.0 0.0173 1.522723e+11 1.768574e+11
3 000001.XSHE 平安银行 XSHE 2015-01-08 1263.656 1221.208 2443951 2.128003e+09 68734.0 0.0143 1.471572e+11 1.709164e+11
4 000001.XSHE 平安银行 XSHE 2015-01-09 1221.208 1231.004 4355039 3.835378e+09 99882.0 0.0255 1.483376e+11 1.722874e+11
5 000001.XSHE 平安银行 XSHE 2015-01-12 1231.004 1205.698 2696688 2.293105e+09 64376.0 0.0158 1.452882e+11 1.687457e+11
6 000001.XSHE 平安银行 XSHE 2015-01-13 1205.698 1198.351 1418187 1.204987e+09 34924.0 0.0083 1.444029e+11 1.677175e+11
7 000001.XSHE 平安银行 XSHE 2015-01-14 1198.351 1208.963 2192762 1.889297e+09 50675.0 0.0128 1.456817e+11 1.692027e+11
8 000001.XSHE 平安银行 XSHE 2015-01-15 1208.963 1253.044 2156548 1.868796e+09 54470.0 0.0126 1.509935e+11 1.753721e+11
9 000001.XSHE 平安银行 XSHE 2015-01-16 1253.044 1254.677 2701125 2.403346e+09 60637.0 0.0158 1.511903e+11 1.756006e+11
In [107]:
df.tail()
Out[107]:
secID secShortName exchangeCD tradeDate preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
5838745 900957.XSHG 凌云B股 XSHG 2021-12-27 0.625 0.638 488511 308154.0 226.0 0.0027 116472000.0 220917000.0
5838746 900957.XSHG 凌云B股 XSHG 2021-12-28 0.638 0.637 177702 111881.0 64.0 0.0010 116288000.0 220568000.0
5838747 900957.XSHG 凌云B股 XSHG 2021-12-29 0.637 0.630 123550 77333.0 58.0 0.0007 115000000.0 218125000.0
5838748 900957.XSHG 凌云B股 XSHG 2021-12-30 0.630 0.635 113600 71308.0 41.0 0.0006 115920000.0 219870000.0
5838749 900957.XSHG 凌云B股 XSHG 2021-12-31 0.635 0.636 167800 105996.0 74.0 0.0009 116104000.0 220219000.0
In [108]:
df.describe()
Out[108]:
preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
count 5.838750e+06 5.838750e+06 5.838750e+06 5.838750e+06 5.820845e+06 5.838750e+06 5.838750e+06 5.838750e+06
mean 1.368568e+02 1.368763e+02 5.525219e+06 2.008992e+08 1.078089e+04 2.986831e-02 1.324705e+10 1.835989e+10
std 2.086103e+03 2.085805e+03 2.405725e+07 4.866879e+08 1.874182e+04 4.649860e-02 5.686519e+10 7.522905e+10
min 3.800000e-02 3.800000e-02 2.000000e+00 1.800000e+01 1.000000e+00 0.000000e+00 3.800000e+06 1.045000e+07
25% 2.167700e+01 2.168600e+01 8.454962e+05 2.916335e+07 2.565000e+03 6.900000e-03 2.203200e+09 3.527868e+09
50% 3.972300e+01 3.973600e+01 1.864330e+06 7.414118e+07 5.410000e+03 1.510000e-02 4.304763e+09 6.153206e+09
75% 7.390000e+01 7.392400e+01 4.317285e+06 1.915373e+08 1.179500e+04 3.380000e-02 9.130280e+09 1.286893e+10
max 4.024782e+05 4.024782e+05 5.109897e+09 6.794128e+10 3.274456e+06 9.896000e-01 3.267370e+12 3.267370e+12
In [109]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5838750 entries, 0 to 5838749
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   secID           object 
 1   secShortName    object 
 2   exchangeCD      object 
 3   tradeDate       object 
 4   preClosePrice   float64
 5   closePrice      float64
 6   turnoverVol     int64  
 7   turnoverValue   float64
 8   dealAmount      float64
 9   turnoverRate    float64
 10  negMarketValue  float64
 11  marketValue     float64
dtypes: float64(7), int64(1), object(4)
memory usage: 534.6+ MB
In [110]:
df['secShortName'].nunique() # 多少只股票
Out[110]:
4853
In [111]:
df['turnoverValue'].mean()
Out[111]:
200899200.96656865
In [112]:
df.select_dtypes(include=np.number)
Out[112]:
preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
0 1293.044 1307.737 4966040 4.565388e+09 92478.0 0.0291 1.575841e+11 1.830268e+11
1 1307.737 1288.146 3761152 3.453446e+09 80325.0 0.0220 1.552233e+11 1.802848e+11
2 1288.146 1263.656 2951601 2.634796e+09 72697.0 0.0173 1.522723e+11 1.768574e+11
3 1263.656 1221.208 2443951 2.128003e+09 68734.0 0.0143 1.471572e+11 1.709164e+11
4 1221.208 1231.004 4355039 3.835378e+09 99882.0 0.0255 1.483376e+11 1.722874e+11
... ... ... ... ... ... ... ... ...
5838745 0.625 0.638 488511 3.081540e+05 226.0 0.0027 1.164720e+08 2.209170e+08
5838746 0.638 0.637 177702 1.118810e+05 64.0 0.0010 1.162880e+08 2.205680e+08
5838747 0.637 0.630 123550 7.733300e+04 58.0 0.0007 1.150000e+08 2.181250e+08
5838748 0.630 0.635 113600 7.130800e+04 41.0 0.0006 1.159200e+08 2.198700e+08
5838749 0.635 0.636 167800 1.059960e+05 74.0 0.0009 1.161040e+08 2.202190e+08

5838750 rows × 8 columns

In [113]:
df.select_dtypes(include=np.number).mean()
Out[113]:
preClosePrice     1.368568e+02
closePrice        1.368763e+02
turnoverVol       5.525219e+06
turnoverValue     2.008992e+08
dealAmount        1.078089e+04
turnoverRate      2.986831e-02
negMarketValue    1.324705e+10
marketValue       1.835989e+10
dtype: float64
In [114]:
df.select_dtypes(include=np.number).mean(axis=1)
Out[114]:
0          4.314767e+10
1          4.237068e+10
2          4.147094e+10
3          4.002552e+10
4          4.055811e+10
               ...     
5838745    4.227324e+07
5838746    4.214321e+07
5838747    4.166574e+07
5838748    4.199687e+07
5838749    4.207461e+07
Length: 5838750, dtype: float64
In [115]:
df_pingan = df[df['secShortName']=='平安银行'].copy()
In [116]:
df_pingan
Out[116]:
secID secShortName exchangeCD tradeDate preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
0 000001.XSHE 平安银行 XSHE 2015-01-05 1293.044 1307.737 4966040 4.565388e+09 92478.0 0.0291 1.575841e+11 1.830268e+11
1 000001.XSHE 平安银行 XSHE 2015-01-06 1307.737 1288.146 3761152 3.453446e+09 80325.0 0.0220 1.552233e+11 1.802848e+11
2 000001.XSHE 平安银行 XSHE 2015-01-07 1288.146 1263.656 2951601 2.634796e+09 72697.0 0.0173 1.522723e+11 1.768574e+11
3 000001.XSHE 平安银行 XSHE 2015-01-08 1263.656 1221.208 2443951 2.128003e+09 68734.0 0.0143 1.471572e+11 1.709164e+11
4 000001.XSHE 平安银行 XSHE 2015-01-09 1221.208 1231.004 4355039 3.835378e+09 99882.0 0.0255 1.483376e+11 1.722874e+11
... ... ... ... ... ... ... ... ... ... ... ... ...
1700 000001.XSHE 平安银行 XSHE 2021-12-27 2224.866 2213.298 881462 1.260455e+09 66911.0 0.0038 3.341671e+11 3.341699e+11
1701 000001.XSHE 平安银行 XSHE 2021-12-28 2213.298 2206.872 1358314 1.934461e+09 89671.0 0.0058 3.331968e+11 3.331996e+11
1702 000001.XSHE 平安银行 XSHE 2021-12-29 2206.872 2152.889 1771527 2.480535e+09 147243.0 0.0076 3.250464e+11 3.250491e+11
1703 000001.XSHE 平安银行 XSHE 2021-12-30 2152.889 2161.886 960485 1.342374e+09 66792.0 0.0041 3.264048e+11 3.264075e+11
1704 000001.XSHE 平安银行 XSHE 2021-12-31 2161.886 2118.185 2110777 2.899617e+09 152423.0 0.0090 3.198068e+11 3.198095e+11

1705 rows × 12 columns

object vs. str¶

object和str有什么区别:

  • object 类型的字符串长度可变(字符串的bytes长度不一)
  • str 类型的字符串长度统一(用0填充或截断)

一些细节:numpy 中允许 object 和 str 两种字符串的格式。但pandas没有采用numpy的这种格式,而是用原生python的字符串格式。numpy的格式更为复杂。例:(来自https://stackoverflow.com/questions/34881079/pandas-distinction-between-str-and-object-types)

In [117]:
x = np.array(['Testing', 'a', 'string'], dtype='|S7') # 长度为7的字符串。|是byteorder指示符
y = np.array(['Testing', 'a', 'string'], dtype=object)

byteorder: https://numpy.org/doc/stable/reference/generated/numpy.dtype.byteorder.html#numpy.dtype.byteorder

In [118]:
x[1] = 'a really really really long string'
x
Out[118]:
array([b'Testing', b'a reall', b'string'], dtype='|S7')
In [119]:
y[1] = 'a really really really long string'
y
Out[119]:
array(['Testing', 'a really really really long string', 'string'],
      dtype=object)

pandas中不允许用固定长度的str

In [120]:
temp = pd.DataFrame({'a':range(5)})
In [121]:
temp['a'].dtype
Out[121]:
dtype('int64')
In [122]:
temp['a'].astype(object).dtype
Out[122]:
dtype('O')
In [123]:
temp['a'].astype(str).dtype
Out[123]:
dtype('O')

Sort, duplicates¶

In [124]:
temp = tm.makeTimeDataFrame(3)
display(temp)
A B C D
2000-01-03 -0.051528 -0.582815 0.392680 -1.395863
2000-01-04 0.538363 -1.165366 -0.394174 -0.829585
2000-01-05 0.952806 1.578885 -1.252698 0.734604
In [125]:
temp2 = temp.stack().to_frame().reset_index()
temp2.columns = ['date','variable','value']
In [126]:
temp2
Out[126]:
date variable value
0 2000-01-03 A -0.051528
1 2000-01-03 B -0.582815
2 2000-01-03 C 0.392680
3 2000-01-03 D -1.395863
4 2000-01-04 A 0.538363
5 2000-01-04 B -1.165366
6 2000-01-04 C -0.394174
7 2000-01-04 D -0.829585
8 2000-01-05 A 0.952806
9 2000-01-05 B 1.578885
10 2000-01-05 C -1.252698
11 2000-01-05 D 0.734604
In [127]:
temp2.sort_values(by='value')
Out[127]:
date variable value
3 2000-01-03 D -1.395863
10 2000-01-05 C -1.252698
5 2000-01-04 B -1.165366
7 2000-01-04 D -0.829585
1 2000-01-03 B -0.582815
6 2000-01-04 C -0.394174
0 2000-01-03 A -0.051528
2 2000-01-03 C 0.392680
4 2000-01-04 A 0.538363
11 2000-01-05 D 0.734604
8 2000-01-05 A 0.952806
9 2000-01-05 B 1.578885
In [128]:
temp2.sort_values(by='variable',ascending=False)
Out[128]:
date variable value
3 2000-01-03 D -1.395863
7 2000-01-04 D -0.829585
11 2000-01-05 D 0.734604
2 2000-01-03 C 0.392680
6 2000-01-04 C -0.394174
10 2000-01-05 C -1.252698
1 2000-01-03 B -0.582815
5 2000-01-04 B -1.165366
9 2000-01-05 B 1.578885
0 2000-01-03 A -0.051528
4 2000-01-04 A 0.538363
8 2000-01-05 A 0.952806
In [130]:
temp2.duplicated('variable')
Out[130]:
0     False
1     False
2     False
3     False
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
dtype: bool
In [131]:
temp2.drop_duplicates('variable')
Out[131]:
date variable value
0 2000-01-03 A -0.051528
1 2000-01-03 B -0.582815
2 2000-01-03 C 0.392680
3 2000-01-03 D -1.395863
In [132]:
temp2.drop_duplicates('variable',keep='last')
Out[132]:
date variable value
8 2000-01-05 A 0.952806
9 2000-01-05 B 1.578885
10 2000-01-05 C -1.252698
11 2000-01-05 D 0.734604

Categorizing values¶

In [133]:
temp = tm.makeDataFrame()
temp['A'] = np.random.randint(0, 100, size=temp.shape[0])
In [134]:
temp.drop(['B','C','D'],axis=1, inplace=True)
In [135]:
temp.columns = ['score']
In [136]:
temp
Out[136]:
score
leMJDtok1y 70
tHQMKBjt4g 87
UHQZfBqS9C 29
HnNcYZiPS3 27
Q8E0jeFgcZ 63
V3O7nacCx6 5
A21m7MIYO0 93
jPRB2EYMJ9 6
lyVA6FSPDQ 35
xRVJg7Noqk 88
GWdYMjt64S 51
tsy4vUUXuW 37
aqO7XSrLqj 75
ozmzHJ0hio 48
R5iMCRIpOp 51
f2WbBxzk3u 53
NmyCgrqM22 79
kn8e4znWFQ 73
61oSoW6COF 39
slhMjph9xG 62
P6px9D4zmM 1
hZPrwuiO1U 28
dY9aIT4EX3 59
eT1vVe47Lq 85
7HcEWFCsMB 83
3Wj1dUlcme 62
AuRMTRsaSF 90
R7dGWFgtRe 92
Y5Y887ynFq 27
xWjpVzhPye 61
In [137]:
?pd.cut
Signature:
pd.cut(
    x,
    bins,
    right: 'bool' = True,
    labels=None,
    retbins: 'bool' = False,
    precision: 'int' = 3,
    include_lowest: 'bool' = False,
    duplicates: 'str' = 'raise',
    ordered: 'bool' = True,
)
Docstring:
Bin values into discrete intervals.

Use `cut` when you need to segment and sort data values into bins. This
function is also useful for going from a continuous variable to a
categorical variable. For example, `cut` could convert ages to groups of
age ranges. Supports binning into an equal number of bins, or a
pre-specified array of bins.

Parameters
----------
x : array-like
    The input array to be binned. Must be 1-dimensional.
bins : int, sequence of scalars, or IntervalIndex
    The criteria to bin by.

    * int : Defines the number of equal-width bins in the range of `x`. The
      range of `x` is extended by .1% on each side to include the minimum
      and maximum values of `x`.
    * sequence of scalars : Defines the bin edges allowing for non-uniform
      width. No extension of the range of `x` is done.
    * IntervalIndex : Defines the exact bins to be used. Note that
      IntervalIndex for `bins` must be non-overlapping.

right : bool, default True
    Indicates whether `bins` includes the rightmost edge or not. If
    ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]``
    indicate (1,2], (2,3], (3,4]. This argument is ignored when
    `bins` is an IntervalIndex.
labels : array or False, default None
    Specifies the labels for the returned bins. Must be the same length as
    the resulting bins. If False, returns only integer indicators of the
    bins. This affects the type of the output container (see below).
    This argument is ignored when `bins` is an IntervalIndex. If True,
    raises an error. When `ordered=False`, labels must be provided.
retbins : bool, default False
    Whether to return the bins or not. Useful when bins is provided
    as a scalar.
precision : int, default 3
    The precision at which to store and display the bins labels.
include_lowest : bool, default False
    Whether the first interval should be left-inclusive or not.
duplicates : {default 'raise', 'drop'}, optional
    If bin edges are not unique, raise ValueError or drop non-uniques.
ordered : bool, default True
    Whether the labels are ordered or not. Applies to returned types
    Categorical and Series (with Categorical dtype). If True,
    the resulting categorical will be ordered. If False, the resulting
    categorical will be unordered (labels must be provided).

    .. versionadded:: 1.1.0

Returns
-------
out : Categorical, Series, or ndarray
    An array-like object representing the respective bin for each value
    of `x`. The type depends on the value of `labels`.

    * None (default) : returns a Series for Series `x` or a
      Categorical for all other inputs. The values stored within
      are Interval dtype.

    * sequence of scalars : returns a Series for Series `x` or a
      Categorical for all other inputs. The values stored within
      are whatever the type in the sequence is.

    * False : returns an ndarray of integers.

bins : numpy.ndarray or IntervalIndex.
    The computed or specified bins. Only returned when `retbins=True`.
    For scalar or sequence `bins`, this is an ndarray with the computed
    bins. If set `duplicates=drop`, `bins` will drop non-unique bin. For
    an IntervalIndex `bins`, this is equal to `bins`.

See Also
--------
qcut : Discretize variable into equal-sized buckets based on rank
    or based on sample quantiles.
Categorical : Array type for storing data that come from a
    fixed set of values.
Series : One-dimensional array with axis labels (including time series).
IntervalIndex : Immutable Index implementing an ordered, sliceable set.

Notes
-----
Any NA values will be NA in the result. Out of bounds values will be NA in
the resulting Series or Categorical object.

Reference :ref:`the user guide <reshaping.tile.cut>` for more examples.

Examples
--------
Discretize into three equal-sized bins.

>>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
... # doctest: +ELLIPSIS
[(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...

>>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
... # doctest: +ELLIPSIS
([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...
array([0.994, 3.   , 5.   , 7.   ]))

Discovers the same bins, but assign them specific labels. Notice that
the returned Categorical's categories are `labels` and is ordered.

>>> pd.cut(np.array([1, 7, 5, 4, 6, 3]),
...        3, labels=["bad", "medium", "good"])
['bad', 'good', 'medium', 'medium', 'good', 'bad']
Categories (3, object): ['bad' < 'medium' < 'good']

``ordered=False`` will result in unordered categories when labels are passed.
This parameter can be used to allow non-unique labels:

>>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
...        labels=["B", "A", "B"], ordered=False)
['B', 'B', 'A', 'A', 'B', 'B']
Categories (2, object): ['A', 'B']

``labels=False`` implies you just want the bins back.

>>> pd.cut([0, 1, 1, 2], bins=4, labels=False)
array([0, 1, 1, 3])

Passing a Series as an input returns a Series with categorical dtype:

>>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
...               index=['a', 'b', 'c', 'd', 'e'])
>>> pd.cut(s, 3)
... # doctest: +ELLIPSIS
a    (1.992, 4.667]
b    (1.992, 4.667]
c    (4.667, 7.333]
d     (7.333, 10.0]
e     (7.333, 10.0]
dtype: category
Categories (3, interval[float64, right]): [(1.992, 4.667] < (4.667, ...

Passing a Series as an input returns a Series with mapping value.
It is used to map numerically to intervals based on bins.

>>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
...               index=['a', 'b', 'c', 'd', 'e'])
>>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False)
... # doctest: +ELLIPSIS
(a    1.0
 b    2.0
 c    3.0
 d    4.0
 e    NaN
 dtype: float64,
 array([ 0,  2,  4,  6,  8, 10]))

Use `drop` optional when bins is not unique

>>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True,
...        right=False, duplicates='drop')
... # doctest: +ELLIPSIS
(a    1.0
 b    2.0
 c    3.0
 d    3.0
 e    NaN
 dtype: float64,
 array([ 0,  2,  4,  6, 10]))

Passing an IntervalIndex for `bins` results in those categories exactly.
Notice that values not covered by the IntervalIndex are set to NaN. 0
is to the left of the first bin (which is closed on the right), and 1.5
falls between two bins.

>>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
>>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
[NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]]
Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]]
File:      ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/reshape/tile.py
Type:      function
In [138]:
temp['score'][3] = 0
In [139]:
bins = [0, 25, 50, 75, 100]
pd.cut(temp['score'], bins=bins, labels=['D','C','B','A'])
Out[139]:
leMJDtok1y      B
tHQMKBjt4g      A
UHQZfBqS9C      C
HnNcYZiPS3    NaN
Q8E0jeFgcZ      B
V3O7nacCx6      D
A21m7MIYO0      A
jPRB2EYMJ9      D
lyVA6FSPDQ      C
xRVJg7Noqk      A
GWdYMjt64S      B
tsy4vUUXuW      C
aqO7XSrLqj      B
ozmzHJ0hio      C
R5iMCRIpOp      B
f2WbBxzk3u      B
NmyCgrqM22      A
kn8e4znWFQ      B
61oSoW6COF      C
slhMjph9xG      B
P6px9D4zmM      D
hZPrwuiO1U      C
dY9aIT4EX3      B
eT1vVe47Lq      A
7HcEWFCsMB      A
3Wj1dUlcme      B
AuRMTRsaSF      A
R7dGWFgtRe      A
Y5Y887ynFq      C
xWjpVzhPye      B
Name: score, dtype: category
Categories (4, object): ['D' < 'C' < 'B' < 'A']
In [140]:
bins = [0, 25, 50, 75, 100]
pd.cut(temp['score'], bins=bins, labels=['D','C','B','A'],include_lowest=True)
Out[140]:
leMJDtok1y    B
tHQMKBjt4g    A
UHQZfBqS9C    C
HnNcYZiPS3    D
Q8E0jeFgcZ    B
V3O7nacCx6    D
A21m7MIYO0    A
jPRB2EYMJ9    D
lyVA6FSPDQ    C
xRVJg7Noqk    A
GWdYMjt64S    B
tsy4vUUXuW    C
aqO7XSrLqj    B
ozmzHJ0hio    C
R5iMCRIpOp    B
f2WbBxzk3u    B
NmyCgrqM22    A
kn8e4znWFQ    B
61oSoW6COF    C
slhMjph9xG    B
P6px9D4zmM    D
hZPrwuiO1U    C
dY9aIT4EX3    B
eT1vVe47Lq    A
7HcEWFCsMB    A
3Wj1dUlcme    B
AuRMTRsaSF    A
R7dGWFgtRe    A
Y5Y887ynFq    C
xWjpVzhPye    B
Name: score, dtype: category
Categories (4, object): ['D' < 'C' < 'B' < 'A']

datetime type, time series as index¶

Useful resources: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html

In [141]:
df_pingan
Out[141]:
secID secShortName exchangeCD tradeDate preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
0 000001.XSHE 平安银行 XSHE 2015-01-05 1293.044 1307.737 4966040 4.565388e+09 92478.0 0.0291 1.575841e+11 1.830268e+11
1 000001.XSHE 平安银行 XSHE 2015-01-06 1307.737 1288.146 3761152 3.453446e+09 80325.0 0.0220 1.552233e+11 1.802848e+11
2 000001.XSHE 平安银行 XSHE 2015-01-07 1288.146 1263.656 2951601 2.634796e+09 72697.0 0.0173 1.522723e+11 1.768574e+11
3 000001.XSHE 平安银行 XSHE 2015-01-08 1263.656 1221.208 2443951 2.128003e+09 68734.0 0.0143 1.471572e+11 1.709164e+11
4 000001.XSHE 平安银行 XSHE 2015-01-09 1221.208 1231.004 4355039 3.835378e+09 99882.0 0.0255 1.483376e+11 1.722874e+11
... ... ... ... ... ... ... ... ... ... ... ... ...
1700 000001.XSHE 平安银行 XSHE 2021-12-27 2224.866 2213.298 881462 1.260455e+09 66911.0 0.0038 3.341671e+11 3.341699e+11
1701 000001.XSHE 平安银行 XSHE 2021-12-28 2213.298 2206.872 1358314 1.934461e+09 89671.0 0.0058 3.331968e+11 3.331996e+11
1702 000001.XSHE 平安银行 XSHE 2021-12-29 2206.872 2152.889 1771527 2.480535e+09 147243.0 0.0076 3.250464e+11 3.250491e+11
1703 000001.XSHE 平安银行 XSHE 2021-12-30 2152.889 2161.886 960485 1.342374e+09 66792.0 0.0041 3.264048e+11 3.264075e+11
1704 000001.XSHE 平安银行 XSHE 2021-12-31 2161.886 2118.185 2110777 2.899617e+09 152423.0 0.0090 3.198068e+11 3.198095e+11

1705 rows × 12 columns

In [142]:
df_pingan.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1705 entries, 0 to 1704
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   secID           1705 non-null   object 
 1   secShortName    1705 non-null   object 
 2   exchangeCD      1705 non-null   object 
 3   tradeDate       1705 non-null   object 
 4   preClosePrice   1705 non-null   float64
 5   closePrice      1705 non-null   float64
 6   turnoverVol     1705 non-null   int64  
 7   turnoverValue   1705 non-null   float64
 8   dealAmount      1705 non-null   float64
 9   turnoverRate    1705 non-null   float64
 10  negMarketValue  1705 non-null   float64
 11  marketValue     1705 non-null   float64
dtypes: float64(7), int64(1), object(4)
memory usage: 173.2+ KB
In [143]:
df_pingan['tradeDate'] = pd.to_datetime(df_pingan['tradeDate'], format='%Y-%m-%d')
In [144]:
df_pingan.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1705 entries, 0 to 1704
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   secID           1705 non-null   object        
 1   secShortName    1705 non-null   object        
 2   exchangeCD      1705 non-null   object        
 3   tradeDate       1705 non-null   datetime64[ns]
 4   preClosePrice   1705 non-null   float64       
 5   closePrice      1705 non-null   float64       
 6   turnoverVol     1705 non-null   int64         
 7   turnoverValue   1705 non-null   float64       
 8   dealAmount      1705 non-null   float64       
 9   turnoverRate    1705 non-null   float64       
 10  negMarketValue  1705 non-null   float64       
 11  marketValue     1705 non-null   float64       
dtypes: datetime64[ns](1), float64(7), int64(1), object(3)
memory usage: 173.2+ KB
In [145]:
df_pingan.set_index('tradeDate', inplace=True)
In [146]:
df_pingan
Out[146]:
secID secShortName exchangeCD preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
tradeDate
2015-01-05 000001.XSHE 平安银行 XSHE 1293.044 1307.737 4966040 4.565388e+09 92478.0 0.0291 1.575841e+11 1.830268e+11
2015-01-06 000001.XSHE 平安银行 XSHE 1307.737 1288.146 3761152 3.453446e+09 80325.0 0.0220 1.552233e+11 1.802848e+11
2015-01-07 000001.XSHE 平安银行 XSHE 1288.146 1263.656 2951601 2.634796e+09 72697.0 0.0173 1.522723e+11 1.768574e+11
2015-01-08 000001.XSHE 平安银行 XSHE 1263.656 1221.208 2443951 2.128003e+09 68734.0 0.0143 1.471572e+11 1.709164e+11
2015-01-09 000001.XSHE 平安银行 XSHE 1221.208 1231.004 4355039 3.835378e+09 99882.0 0.0255 1.483376e+11 1.722874e+11
... ... ... ... ... ... ... ... ... ... ... ...
2021-12-27 000001.XSHE 平安银行 XSHE 2224.866 2213.298 881462 1.260455e+09 66911.0 0.0038 3.341671e+11 3.341699e+11
2021-12-28 000001.XSHE 平安银行 XSHE 2213.298 2206.872 1358314 1.934461e+09 89671.0 0.0058 3.331968e+11 3.331996e+11
2021-12-29 000001.XSHE 平安银行 XSHE 2206.872 2152.889 1771527 2.480535e+09 147243.0 0.0076 3.250464e+11 3.250491e+11
2021-12-30 000001.XSHE 平安银行 XSHE 2152.889 2161.886 960485 1.342374e+09 66792.0 0.0041 3.264048e+11 3.264075e+11
2021-12-31 000001.XSHE 平安银行 XSHE 2161.886 2118.185 2110777 2.899617e+09 152423.0 0.0090 3.198068e+11 3.198095e+11

1705 rows × 11 columns

BM: Business Month

In [147]:
df_pingan.resample('BM').last()
Out[147]:
secID secShortName exchangeCD preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
tradeDate
2015-01-30 000001.XSHE 平安银行 XSHE 1134.678 1137.127 1614788 1.298736e+09 42061.0 0.0095 1.370254e+11 1.591488e+11
2015-02-27 000001.XSHE 平安银行 XSHE 1148.556 1142.025 1377727 1.115146e+09 36250.0 0.0081 1.376156e+11 1.598343e+11
2015-03-31 000001.XSHE 平安银行 XSHE 1278.350 1285.697 4745114 4.393135e+09 114032.0 0.0278 1.549282e+11 1.799421e+11
2015-04-30 000001.XSHE 平安银行 XSHE 1645.458 1650.399 4182058 4.899380e+09 120845.0 0.0245 1.971277e+11 2.289549e+11
2015-05-29 000001.XSHE 平安银行 XSHE 1531.808 1514.019 2915126 3.103544e+09 99260.0 0.0171 1.808381e+11 2.192089e+11
... ... ... ... ... ... ... ... ... ... ... ...
2021-08-31 000001.XSHE 平安银行 XSHE 2277.563 2287.846 1416461 2.072007e+09 119791.0 0.0061 3.454224e+11 3.454253e+11
2021-09-30 000001.XSHE 平安银行 XSHE 2332.832 2304.555 959216 1.424676e+09 70140.0 0.0041 3.479452e+11 3.479481e+11
2021-10-29 000001.XSHE 平安银行 XSHE 2534.625 2506.348 650958 1.052760e+09 76331.0 0.0028 3.784122e+11 3.784154e+11
2021-11-30 000001.XSHE 平安银行 XSHE 2250.572 2241.575 884472 1.280385e+09 65260.0 0.0038 3.384364e+11 3.384392e+11
2021-12-31 000001.XSHE 平安银行 XSHE 2161.886 2118.185 2110777 2.899617e+09 152423.0 0.0090 3.198068e+11 3.198095e+11

84 rows × 11 columns

In [148]:
df_pingan.resample('M').mean(numeric_only=True)
Out[148]:
preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
tradeDate
2015-01-31 1204.922300 1197.126450 2.685951e+06 2.293232e+09 64189.250000 0.015725 1.442554e+11 1.675461e+11
2015-02-28 1127.222600 1127.549133 1.506542e+06 1.202904e+09 37079.066667 0.008827 1.358712e+11 1.578083e+11
2015-03-31 1193.861227 1200.391773 2.966580e+06 2.552488e+09 75256.318182 0.017386 1.446489e+11 1.680031e+11
2015-04-30 1543.791762 1561.158524 4.287529e+06 4.712147e+09 138826.190476 0.025114 1.869631e+11 2.171492e+11
2015-05-31 1577.762100 1570.943100 2.897439e+06 3.198760e+09 96879.850000 0.016965 1.876373e+11 2.213028e+11
... ... ... ... ... ... ... ... ...
2021-08-31 2440.388364 2441.031045 1.143970e+06 1.800585e+09 96686.318182 0.004891 3.685506e+11 3.685537e+11
2021-09-30 2390.798950 2391.634400 1.659547e+06 2.550127e+09 126351.950000 0.007095 3.610926e+11 3.610956e+11
2021-10-31 2504.098750 2516.710812 1.081450e+06 1.752654e+09 97972.312500 0.004625 3.799768e+11 3.799800e+11
2021-11-30 2318.985409 2306.950273 1.188170e+06 1.768122e+09 94558.681818 0.005073 3.483068e+11 3.483098e+11
2021-12-31 2270.019130 2264.654348 1.342157e+06 1.968544e+09 96840.304348 0.005735 3.419210e+11 3.419238e+11

84 rows × 8 columns

In [149]:
df_pingan.loc['2015']
Out[149]:
secID secShortName exchangeCD preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
tradeDate
2015-01-05 000001.XSHE 平安银行 XSHE 1293.044 1307.737 4966040 4.565388e+09 92478.0 0.0291 1.575841e+11 1.830268e+11
2015-01-06 000001.XSHE 平安银行 XSHE 1307.737 1288.146 3761152 3.453446e+09 80325.0 0.0220 1.552233e+11 1.802848e+11
2015-01-07 000001.XSHE 平安银行 XSHE 1288.146 1263.656 2951601 2.634796e+09 72697.0 0.0173 1.522723e+11 1.768574e+11
2015-01-08 000001.XSHE 平安银行 XSHE 1263.656 1221.208 2443951 2.128003e+09 68734.0 0.0143 1.471572e+11 1.709164e+11
2015-01-09 000001.XSHE 平安银行 XSHE 1221.208 1231.004 4355039 3.835378e+09 99882.0 0.0255 1.483376e+11 1.722874e+11
... ... ... ... ... ... ... ... ... ... ... ...
2015-12-25 000001.XSHE 平安银行 XSHE 1219.517 1226.435 578480 4.956381e+08 19587.0 0.0034 1.464883e+11 1.775707e+11
2015-12-28 000001.XSHE 平安银行 XSHE 1226.435 1183.939 1189828 1.003010e+09 33043.0 0.0070 1.414126e+11 1.714179e+11
2015-12-29 000001.XSHE 平安银行 XSHE 1183.939 1194.810 896705 7.447184e+08 20170.0 0.0053 1.427110e+11 1.729919e+11
2015-12-30 000001.XSHE 平安银行 XSHE 1194.810 1195.798 770642 6.412773e+08 18064.0 0.0045 1.428291e+11 1.731350e+11
2015-12-31 000001.XSHE 平安银行 XSHE 1195.798 1184.928 710734 5.916436e+08 14765.0 0.0042 1.415306e+11 1.715610e+11

244 rows × 11 columns

In [150]:
df_pingan.loc['2015-01':'2018-03']
Out[150]:
secID secShortName exchangeCD preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
tradeDate
2015-01-05 000001.XSHE 平安银行 XSHE 1293.044 1307.737 4966040 4.565388e+09 92478.0 0.0291 1.575841e+11 1.830268e+11
2015-01-06 000001.XSHE 平安银行 XSHE 1307.737 1288.146 3761152 3.453446e+09 80325.0 0.0220 1.552233e+11 1.802848e+11
2015-01-07 000001.XSHE 平安银行 XSHE 1288.146 1263.656 2951601 2.634796e+09 72697.0 0.0173 1.522723e+11 1.768574e+11
2015-01-08 000001.XSHE 平安银行 XSHE 1263.656 1221.208 2443951 2.128003e+09 68734.0 0.0143 1.471572e+11 1.709164e+11
2015-01-09 000001.XSHE 平安银行 XSHE 1221.208 1231.004 4355039 3.835378e+09 99882.0 0.0255 1.483376e+11 1.722874e+11
... ... ... ... ... ... ... ... ... ... ... ...
2018-03-26 000001.XSHE 平安银行 XSHE 1384.775 1334.708 1668113 1.519435e+09 63646.0 0.0082 1.849136e+11 1.876726e+11
2018-03-27 000001.XSHE 平安银行 XSHE 1334.708 1335.929 1330940 1.213990e+09 49311.0 0.0065 1.850827e+11 1.878443e+11
2018-03-28 000001.XSHE 平安银行 XSHE 1335.929 1329.824 1325020 1.201023e+09 45201.0 0.0065 1.842368e+11 1.869858e+11
2018-03-29 000001.XSHE 平安银行 XSHE 1329.824 1349.362 1604219 1.445282e+09 61870.0 0.0079 1.869437e+11 1.897330e+11
2018-03-30 000001.XSHE 平安银行 XSHE 1349.362 1331.045 906846 8.234382e+08 31869.0 0.0044 1.844060e+11 1.871575e+11

791 rows × 11 columns

apply, groupby apply¶

注意:pandas中的apply操作有时会很慢(包括groupby apply)

In [151]:
df
Out[151]:
secID secShortName exchangeCD tradeDate preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
0 000001.XSHE 平安银行 XSHE 2015-01-05 1293.044 1307.737 4966040 4.565388e+09 92478.0 0.0291 1.575841e+11 1.830268e+11
1 000001.XSHE 平安银行 XSHE 2015-01-06 1307.737 1288.146 3761152 3.453446e+09 80325.0 0.0220 1.552233e+11 1.802848e+11
2 000001.XSHE 平安银行 XSHE 2015-01-07 1288.146 1263.656 2951601 2.634796e+09 72697.0 0.0173 1.522723e+11 1.768574e+11
3 000001.XSHE 平安银行 XSHE 2015-01-08 1263.656 1221.208 2443951 2.128003e+09 68734.0 0.0143 1.471572e+11 1.709164e+11
4 000001.XSHE 平安银行 XSHE 2015-01-09 1221.208 1231.004 4355039 3.835378e+09 99882.0 0.0255 1.483376e+11 1.722874e+11
... ... ... ... ... ... ... ... ... ... ... ... ...
5838745 900957.XSHG 凌云B股 XSHG 2021-12-27 0.625 0.638 488511 3.081540e+05 226.0 0.0027 1.164720e+08 2.209170e+08
5838746 900957.XSHG 凌云B股 XSHG 2021-12-28 0.638 0.637 177702 1.118810e+05 64.0 0.0010 1.162880e+08 2.205680e+08
5838747 900957.XSHG 凌云B股 XSHG 2021-12-29 0.637 0.630 123550 7.733300e+04 58.0 0.0007 1.150000e+08 2.181250e+08
5838748 900957.XSHG 凌云B股 XSHG 2021-12-30 0.630 0.635 113600 7.130800e+04 41.0 0.0006 1.159200e+08 2.198700e+08
5838749 900957.XSHG 凌云B股 XSHG 2021-12-31 0.635 0.636 167800 1.059960e+05 74.0 0.0009 1.161040e+08 2.202190e+08

5838750 rows × 12 columns

In [152]:
df.select_dtypes(np.number).apply(lambda x: x**2+3)
Out[152]:
preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
0 1.671966e+06 1.710179e+06 24661553281603 2.084277e+19 8.552180e+09 3.000847 2.483276e+22 3.349881e+22
1 1.710179e+06 1.659323e+06 14146264367107 1.192629e+19 6.452106e+09 3.000484 2.409428e+22 3.250262e+22
2 1.659323e+06 1.596829e+06 8711948463204 6.942152e+18 5.284854e+09 3.000299 2.318685e+22 3.127853e+22
3 1.596829e+06 1.491352e+06 5972896490404 4.528399e+18 4.724363e+09 3.000204 2.165525e+22 2.921242e+22
4 1.491352e+06 1.515374e+06 18966364691524 1.471013e+19 9.976414e+09 3.000650 2.200405e+22 2.968295e+22
... ... ... ... ... ... ... ... ...
5838745 3.390625e+00 3.407044e+00 238642997124 9.495889e+10 5.107900e+04 3.000007 1.356573e+16 4.880432e+16
5838746 3.407044e+00 3.405769e+00 31578000807 1.251736e+10 4.099000e+03 3.000001 1.352290e+16 4.865024e+16
5838747 3.405769e+00 3.396900e+00 15264602503 5.980393e+09 3.367000e+03 3.000000 1.322500e+16 4.757852e+16
5838748 3.396900e+00 3.403225e+00 12904960003 5.084831e+09 1.684000e+03 3.000000 1.343745e+16 4.834282e+16
5838749 3.403225e+00 3.404496e+00 28156840003 1.123515e+10 5.479000e+03 3.000001 1.348014e+16 4.849641e+16

5838750 rows × 8 columns

In [153]:
df.groupby('secShortName')
Out[153]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f99b254a5e0>
In [154]:
df.groupby('secShortName').last()
Out[154]:
secID exchangeCD tradeDate preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
secShortName
*ST万方 000638.XSHE XSHE 2021-12-31 21.428 21.560 550679 1.185508e+07 794.0 0.0079 1.509872e+09 1.509872e+09
*ST上普 600680.XSHG XSHG 2018-04-27 28.879 30.133 1272828 3.498398e+07 2843.0 0.0180 1.979601e+09 2.939313e+09
*ST东海A 000613.XSHE XSHE 2021-12-31 5.678 5.836 3854756 2.219116e+07 1384.0 0.0207 1.088870e+09 1.485528e+09
*ST东海B 200613.XSHE XSHE 2021-12-31 1.757 1.757 33909 6.009500e+04 13.0 0.0004 1.408000e+08 5.825600e+08
*ST东电 000585.XSHE XSHE 2021-12-31 2.617 2.617 5993090 1.543082e+07 1268.0 0.0123 1.249313e+09 1.790408e+09
... ... ... ... ... ... ... ... ... ... ... ...
龙竹科技 831445.XBEI XBEI 2021-12-31 20.824 21.098 456508 8.807597e+06 NaN 0.0068 1.297350e+09 2.217318e+09
龙腾光电 688055.XSHG XSHG 2021-12-31 6.808 6.858 4664701 3.186770e+07 3830.0 0.0026 1.223867e+10 2.283333e+10
龙蟠科技 603906.XSHG XSHG 2021-12-31 77.083 77.208 3700591 2.766969e+08 20421.0 0.0155 1.785667e+10 1.785667e+10
龙软科技 688078.XSHG XSHG 2021-12-31 62.467 59.057 963837 5.722752e+07 2401.0 0.0339 1.657041e+09 4.117650e+09
龙高股份 605086.XSHG XSHG 2021-12-31 23.830 23.620 833004 1.976806e+07 1403.0 0.0260 7.558400e+08 3.023360e+09

4853 rows × 11 columns

In [155]:
df.groupby('secShortName').first()
Out[155]:
secID exchangeCD tradeDate preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
secShortName
*ST万方 000638.XSHE XSHE 2015-01-05 28.364 27.304 3103725 8.408389e+07 4279.0 0.0443 1.912092e+09 1.912092e+09
*ST上普 600680.XSHG XSHG 2015-01-05 57.366 57.562 483414 2.566049e+07 1787.0 0.0069 3.781578e+09 5.614890e+09
*ST东海A 000613.XSHE XSHE 2015-01-05 9.769 9.783 2242078 2.191941e+07 1159.0 0.0123 1.782263e+09 2.490444e+09
*ST东海B 200613.XSHE XSHE 2015-01-05 6.642 6.576 164636 1.085171e+06 50.0 0.0021 5.271200e+08 2.180959e+09
*ST东电 000585.XSHE XSHE 2015-01-05 5.183 5.272 9358955 4.800403e+07 4519.0 0.0192 2.516909e+09 3.607018e+09
... ... ... ... ... ... ... ... ... ... ... ...
龙竹科技 831445.XBEI XBEI 2020-07-27 9.180 7.840 4804739 4.053933e+07 NaN 0.0730 5.157141e+08 8.976079e+08
龙腾光电 688055.XSHG XSHG 2020-08-17 1.220 9.850 215585877 1.892487e+09 211520.0 0.8003 2.653273e+09 3.283333e+10
龙蟠科技 603906.XSHG XSHG 2017-04-10 9.520 13.710 11500 1.576650e+05 15.0 0.0002 7.129200e+08 2.851680e+09
龙软科技 688078.XSHG XSHG 2019-12-30 21.590 48.900 12454266 6.069590e+08 28417.0 0.7736 7.872460e+08 3.459675e+09
龙高股份 605086.XSHG XSHG 2021-04-16 12.860 18.520 226150 4.186135e+06 345.0 0.0071 5.926400e+08 2.370560e+09

4853 rows × 11 columns

In [156]:
df.groupby('secShortName').head(3)
Out[156]:
secID secShortName exchangeCD tradeDate preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
0 000001.XSHE 平安银行 XSHE 2015-01-05 1293.044 1307.737 4966040 4.565388e+09 92478.0 0.0291 1.575841e+11 1.830268e+11
1 000001.XSHE 平安银行 XSHE 2015-01-06 1307.737 1288.146 3761152 3.453446e+09 80325.0 0.0220 1.552233e+11 1.802848e+11
2 000001.XSHE 平安银行 XSHE 2015-01-07 1288.146 1263.656 2951601 2.634796e+09 72697.0 0.0173 1.522723e+11 1.768574e+11
1705 000002.XSHE 万科A XSHE 2015-01-05 1567.420 1681.312 11835160 9.700712e+09 185938.0 0.0676 1.447266e+11 1.645692e+11
1706 000002.XSHE 万科A XSHE 2015-01-06 1681.312 1619.292 6036510 4.839616e+09 123274.0 0.0345 1.393879e+11 1.584986e+11
... ... ... ... ... ... ... ... ... ... ... ... ...
5835695 900956.XSHG 东贝B股 XSHG 2015-01-06 1.159 1.158 438800 4.708660e+05 265.0 0.0038 1.242000e+08 2.538000e+08
5835696 900956.XSHG 东贝B股 XSHG 2015-01-07 1.158 1.159 429200 4.600200e+05 236.0 0.0037 1.243150e+08 2.540350e+08
5837045 900957.XSHG 凌云B股 XSHG 2015-01-05 0.620 0.628 615555 3.823390e+05 268.0 0.0033 1.146320e+08 2.174270e+08
5837046 900957.XSHG 凌云B股 XSHG 2015-01-06 0.628 0.623 459481 2.838860e+05 194.0 0.0025 1.137120e+08 2.156820e+08
5837047 900957.XSHG 凌云B股 XSHG 2015-01-07 0.623 0.631 703095 4.412970e+05 324.0 0.0038 1.151840e+08 2.184740e+08

14554 rows × 12 columns

In [157]:
df.groupby('secShortName').nth(-2)
Out[157]:
secID exchangeCD tradeDate preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
secShortName
*ST万方 000638.XSHE XSHE 2021-12-30 21.163 21.428 793779 1.689394e+07 1009.0 0.0113 1.500590e+09 1.500590e+09
*ST上普 600680.XSHG XSHG 2018-04-26 28.800 28.879 867415 2.348474e+07 1907.0 0.0123 1.897224e+09 2.817000e+09
*ST东海A 000613.XSHE XSHE 2021-12-30 5.521 5.678 2735594 1.541656e+07 1158.0 0.0147 1.059513e+09 1.445477e+09
*ST东海B 200613.XSHE XSHE 2021-12-30 1.757 1.757 261068 4.593800e+05 10.0 0.0033 1.408000e+08 5.825600e+08
*ST东电 000585.XSHE XSHE 2021-12-30 2.604 2.617 2416167 6.230545e+06 678.0 0.0050 1.249313e+09 1.790408e+09
... ... ... ... ... ... ... ... ... ... ... ...
龙竹科技 831445.XBEI XBEI 2021-12-30 20.167 20.824 561059 1.063724e+07 NaN 0.0083 1.280519e+09 2.188552e+09
龙腾光电 688055.XSHG XSHG 2021-12-30 6.768 6.808 4752067 3.232808e+07 3395.0 0.0027 1.214933e+10 2.266667e+10
龙蟠科技 603906.XSHG XSHG 2021-12-30 79.167 77.083 5338171 4.007450e+08 31920.0 0.0223 1.782774e+10 1.782774e+10
龙软科技 688078.XSHG XSHG 2021-12-30 61.980 62.467 387495 2.392429e+07 1454.0 0.0136 1.752706e+09 4.355370e+09
龙高股份 605086.XSHG XSHG 2021-12-30 23.510 23.830 1185853 2.812384e+07 2482.0 0.0371 7.625600e+08 3.050240e+09

4852 rows × 11 columns

multiindex (panel data)¶

In [158]:
arrays = [
    np.array(["bar", "bar", 'bar', "baz", "baz", "baz", "foo", "foo", "foo", "qux", "qux", "qux"]),
    np.array([1,2,3, 1,2,3, 1,2,3, 1,2,3]),
]
temp = pd.DataFrame(np.random.randn(12, 4), index=arrays)
In [159]:
temp
Out[159]:
0 1 2 3
bar 1 -1.742879 0.155123 -1.454690 -0.127365
2 0.958199 0.372288 -1.108044 -0.195609
3 0.379005 -1.867816 -0.234514 -1.230953
baz 1 -1.108997 -0.506065 -1.544889 0.692354
2 0.137478 -0.073801 2.395381 0.457265
3 -1.559135 0.570340 0.332941 0.428118
foo 1 -1.138488 0.911376 -0.733840 -2.143457
2 0.706423 -0.352932 1.191399 0.198311
3 -1.031937 -0.656603 1.706294 -0.295609
qux 1 0.470402 -0.726789 2.559832 1.079300
2 0.914655 1.224235 1.376842 0.244068
3 1.305349 0.966439 -0.521711 0.436194
In [161]:
temp.loc[('bar',2):('foo',3)]
Out[161]:
0 1 2 3
bar 2 0.958199 0.372288 -1.108044 -0.195609
3 0.379005 -1.867816 -0.234514 -1.230953
baz 1 -1.108997 -0.506065 -1.544889 0.692354
2 0.137478 -0.073801 2.395381 0.457265
3 -1.559135 0.570340 0.332941 0.428118
foo 1 -1.138488 0.911376 -0.733840 -2.143457
2 0.706423 -0.352932 1.191399 0.198311
3 -1.031937 -0.656603 1.706294 -0.295609
In [162]:
df['tradeDate'] = pd.to_datetime(df['tradeDate'], format='%Y-%m-%d')
In [163]:
df.set_index(['secShortName','tradeDate'])
Out[163]:
secID exchangeCD preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
secShortName tradeDate
平安银行 2015-01-05 000001.XSHE XSHE 1293.044 1307.737 4966040 4.565388e+09 92478.0 0.0291 1.575841e+11 1.830268e+11
2015-01-06 000001.XSHE XSHE 1307.737 1288.146 3761152 3.453446e+09 80325.0 0.0220 1.552233e+11 1.802848e+11
2015-01-07 000001.XSHE XSHE 1288.146 1263.656 2951601 2.634796e+09 72697.0 0.0173 1.522723e+11 1.768574e+11
2015-01-08 000001.XSHE XSHE 1263.656 1221.208 2443951 2.128003e+09 68734.0 0.0143 1.471572e+11 1.709164e+11
2015-01-09 000001.XSHE XSHE 1221.208 1231.004 4355039 3.835378e+09 99882.0 0.0255 1.483376e+11 1.722874e+11
... ... ... ... ... ... ... ... ... ... ... ...
凌云B股 2021-12-27 900957.XSHG XSHG 0.625 0.638 488511 3.081540e+05 226.0 0.0027 1.164720e+08 2.209170e+08
2021-12-28 900957.XSHG XSHG 0.638 0.637 177702 1.118810e+05 64.0 0.0010 1.162880e+08 2.205680e+08
2021-12-29 900957.XSHG XSHG 0.637 0.630 123550 7.733300e+04 58.0 0.0007 1.150000e+08 2.181250e+08
2021-12-30 900957.XSHG XSHG 0.630 0.635 113600 7.130800e+04 41.0 0.0006 1.159200e+08 2.198700e+08
2021-12-31 900957.XSHG XSHG 0.635 0.636 167800 1.059960e+05 74.0 0.0009 1.161040e+08 2.202190e+08

5838750 rows × 10 columns

In [164]:
df.set_index(['secShortName','tradeDate'],inplace=True)
In [165]:
df
Out[165]:
secID exchangeCD preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
secShortName tradeDate
平安银行 2015-01-05 000001.XSHE XSHE 1293.044 1307.737 4966040 4.565388e+09 92478.0 0.0291 1.575841e+11 1.830268e+11
2015-01-06 000001.XSHE XSHE 1307.737 1288.146 3761152 3.453446e+09 80325.0 0.0220 1.552233e+11 1.802848e+11
2015-01-07 000001.XSHE XSHE 1288.146 1263.656 2951601 2.634796e+09 72697.0 0.0173 1.522723e+11 1.768574e+11
2015-01-08 000001.XSHE XSHE 1263.656 1221.208 2443951 2.128003e+09 68734.0 0.0143 1.471572e+11 1.709164e+11
2015-01-09 000001.XSHE XSHE 1221.208 1231.004 4355039 3.835378e+09 99882.0 0.0255 1.483376e+11 1.722874e+11
... ... ... ... ... ... ... ... ... ... ... ...
凌云B股 2021-12-27 900957.XSHG XSHG 0.625 0.638 488511 3.081540e+05 226.0 0.0027 1.164720e+08 2.209170e+08
2021-12-28 900957.XSHG XSHG 0.638 0.637 177702 1.118810e+05 64.0 0.0010 1.162880e+08 2.205680e+08
2021-12-29 900957.XSHG XSHG 0.637 0.630 123550 7.733300e+04 58.0 0.0007 1.150000e+08 2.181250e+08
2021-12-30 900957.XSHG XSHG 0.630 0.635 113600 7.130800e+04 41.0 0.0006 1.159200e+08 2.198700e+08
2021-12-31 900957.XSHG XSHG 0.635 0.636 167800 1.059960e+05 74.0 0.0009 1.161040e+08 2.202190e+08

5838750 rows × 10 columns

In [166]:
df.groupby(['secShortName','tradeDate']).last()
Out[166]:
secID exchangeCD preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
secShortName tradeDate
*ST万方 2015-01-05 000638.XSHE XSHE 28.364 27.304 3103725 84083888.60 4279.0 0.0443 1.912092e+09 1.912092e+09
2015-01-06 000638.XSHE XSHE 27.304 27.039 1926608 51665933.96 3060.0 0.0275 1.893528e+09 1.893528e+09
2015-01-07 000638.XSHE XSHE 27.039 26.995 1295798 35013368.40 1919.0 0.0185 1.890434e+09 1.890434e+09
2015-01-08 000638.XSHE XSHE 26.995 27.702 2224507 61274527.17 3560.0 0.0318 1.939938e+09 1.939938e+09
2015-01-09 000638.XSHE XSHE 27.702 27.348 1053730 29061439.79 1860.0 0.0151 1.915186e+09 1.915186e+09
... ... ... ... ... ... ... ... ... ... ... ...
龙高股份 2021-12-27 605086.XSHG XSHG 23.320 23.340 711496 16590677.00 1586.0 0.0222 7.468800e+08 2.987520e+09
2021-12-28 605086.XSHG XSHG 23.340 23.540 819496 19232165.00 1664.0 0.0256 7.532800e+08 3.013120e+09
2021-12-29 605086.XSHG XSHG 23.540 23.510 656443 15410019.00 1646.0 0.0205 7.523200e+08 3.009280e+09
2021-12-30 605086.XSHG XSHG 23.510 23.830 1185853 28123837.00 2482.0 0.0371 7.625600e+08 3.050240e+09
2021-12-31 605086.XSHG XSHG 23.830 23.620 833004 19768062.00 1403.0 0.0260 7.558400e+08 3.023360e+09

5838680 rows × 10 columns

In [167]:
df.loc['平安银行','2021']
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_12830/224712730.py:1: PerformanceWarning: indexing past lexsort depth may impact performance.
  df.loc['平安银行','2021']
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_12830/224712730.py:1: FutureWarning: Indexing a DataFrame with a datetimelike index using a single string to slice the rows, like `frame[string]`, is deprecated and will be removed in a future version. Use `frame.loc[string]` instead.
  df.loc['平安银行','2021']
Out[167]:
secID exchangeCD preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
tradeDate
2021-01-04 000001.XSHE XSHE 2466.388 2372.018 1873816 2.891682e+09 119328.0 0.0080 3.609470e+11 3.609501e+11
2021-01-05 000001.XSHE XSHE 2372.018 2317.181 2195884 3.284607e+09 187364.0 0.0094 3.526025e+11 3.526055e+11
2021-01-06 000001.XSHE XSHE 2317.181 2494.444 2332835 3.648522e+09 154714.0 0.0100 3.795765e+11 3.795798e+11
2021-01-07 000001.XSHE XSHE 2494.444 2537.804 1909948 3.111275e+09 123304.0 0.0082 3.861745e+11 3.861778e+11
2021-01-08 000001.XSHE XSHE 2537.804 2531.427 1441303 2.348316e+09 95828.0 0.0062 3.852042e+11 3.852075e+11
... ... ... ... ... ... ... ... ... ... ...
2021-12-27 000001.XSHE XSHE 2224.866 2213.298 881462 1.260455e+09 66911.0 0.0038 3.341671e+11 3.341699e+11
2021-12-28 000001.XSHE XSHE 2213.298 2206.872 1358314 1.934461e+09 89671.0 0.0058 3.331968e+11 3.331996e+11
2021-12-29 000001.XSHE XSHE 2206.872 2152.889 1771527 2.480535e+09 147243.0 0.0076 3.250464e+11 3.250491e+11
2021-12-30 000001.XSHE XSHE 2152.889 2161.886 960485 1.342374e+09 66792.0 0.0041 3.264048e+11 3.264075e+11
2021-12-31 000001.XSHE XSHE 2161.886 2118.185 2110777 2.899617e+09 152423.0 0.0090 3.198068e+11 3.198095e+11

243 rows × 10 columns

In [168]:
df.loc[('平安银行','2021')] # a tuple is interpreted as one multi-level key
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_12830/2348492628.py:1: PerformanceWarning: indexing past lexsort depth may impact performance.
  df.loc[('平安银行','2021')] # a tuple is interpreted as one multi-level key
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_12830/2348492628.py:1: FutureWarning: Indexing a DataFrame with a datetimelike index using a single string to slice the rows, like `frame[string]`, is deprecated and will be removed in a future version. Use `frame.loc[string]` instead.
  df.loc[('平安银行','2021')] # a tuple is interpreted as one multi-level key
Out[168]:
secID exchangeCD preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
tradeDate
2021-01-04 000001.XSHE XSHE 2466.388 2372.018 1873816 2.891682e+09 119328.0 0.0080 3.609470e+11 3.609501e+11
2021-01-05 000001.XSHE XSHE 2372.018 2317.181 2195884 3.284607e+09 187364.0 0.0094 3.526025e+11 3.526055e+11
2021-01-06 000001.XSHE XSHE 2317.181 2494.444 2332835 3.648522e+09 154714.0 0.0100 3.795765e+11 3.795798e+11
2021-01-07 000001.XSHE XSHE 2494.444 2537.804 1909948 3.111275e+09 123304.0 0.0082 3.861745e+11 3.861778e+11
2021-01-08 000001.XSHE XSHE 2537.804 2531.427 1441303 2.348316e+09 95828.0 0.0062 3.852042e+11 3.852075e+11
... ... ... ... ... ... ... ... ... ... ...
2021-12-27 000001.XSHE XSHE 2224.866 2213.298 881462 1.260455e+09 66911.0 0.0038 3.341671e+11 3.341699e+11
2021-12-28 000001.XSHE XSHE 2213.298 2206.872 1358314 1.934461e+09 89671.0 0.0058 3.331968e+11 3.331996e+11
2021-12-29 000001.XSHE XSHE 2206.872 2152.889 1771527 2.480535e+09 147243.0 0.0076 3.250464e+11 3.250491e+11
2021-12-30 000001.XSHE XSHE 2152.889 2161.886 960485 1.342374e+09 66792.0 0.0041 3.264048e+11 3.264075e+11
2021-12-31 000001.XSHE XSHE 2161.886 2118.185 2110777 2.899617e+09 152423.0 0.0090 3.198068e+11 3.198095e+11

243 rows × 10 columns

In [172]:
df.loc[['平安银行','2016']]# a list is used to specify several keys [on the same level]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[172], line 1
----> 1 df.loc[['平安银行','2016']]

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py:1073, in _LocationIndexer.__getitem__(self, key)
   1070 axis = self.axis or 0
   1072 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1073 return self._getitem_axis(maybe_callable, axis=axis)

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py:1301, in _LocIndexer._getitem_axis(self, key, axis)
   1298     if hasattr(key, "ndim") and key.ndim > 1:
   1299         raise ValueError("Cannot index with multidimensional key")
-> 1301     return self._getitem_iterable(key, axis=axis)
   1303 # nested tuple slicing
   1304 if is_nested_tuple(key, labels):

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py:1239, in _LocIndexer._getitem_iterable(self, key, axis)
   1236 self._validate_key(key, axis)
   1238 # A collection of keys
-> 1239 keyarr, indexer = self._get_listlike_indexer(key, axis)
   1240 return self.obj._reindex_with_indexers(
   1241     {axis: [keyarr, indexer]}, copy=True, allow_dups=True
   1242 )

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py:1432, in _LocIndexer._get_listlike_indexer(self, key, axis)
   1429 ax = self.obj._get_axis(axis)
   1430 axis_name = self.obj._get_axis_name(axis)
-> 1432 keyarr, indexer = ax._get_indexer_strict(key, axis_name)
   1434 return keyarr, indexer

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/multi.py:2623, in MultiIndex._get_indexer_strict(self, key, axis_name)
   2620 if len(keyarr) and not isinstance(keyarr[0], tuple):
   2621     indexer = self._get_indexer_level_0(keyarr)
-> 2623     self._raise_if_missing(key, indexer, axis_name)
   2624     return self[indexer], indexer
   2626 return super()._get_indexer_strict(key, axis_name)

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/multi.py:2641, in MultiIndex._raise_if_missing(self, key, indexer, axis_name)
   2639 cmask = check == -1
   2640 if cmask.any():
-> 2641     raise KeyError(f"{keyarr[cmask]} not in index")
   2642 # We get here when levels still contain values which are not
   2643 # actually in Index anymore
   2644 raise KeyError(f"{keyarr} not in index")

KeyError: "['2016'] not in index"
In [173]:
df.loc[('平安银行','2015':'2016')]
  Cell In[173], line 1
    df.loc[('平安银行','2015':'2016')]
                         ^
SyntaxError: invalid syntax
In [174]:
df.loc[('平安银行',pd.Timestamp('2015-01-05')):('平安银行', pd.Timestamp('2015-01-28'))]
---------------------------------------------------------------------------
UnsortedIndexError                        Traceback (most recent call last)
Cell In[174], line 1
----> 1 df.loc[('平安银行',pd.Timestamp('2015-01-05')):('平安银行', pd.Timestamp('2015-01-28'))]

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py:1073, in _LocationIndexer.__getitem__(self, key)
   1070 axis = self.axis or 0
   1072 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1073 return self._getitem_axis(maybe_callable, axis=axis)

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py:1290, in _LocIndexer._getitem_axis(self, key, axis)
   1288 if isinstance(key, slice):
   1289     self._validate_key(key, axis)
-> 1290     return self._get_slice_axis(key, axis=axis)
   1291 elif com.is_bool_indexer(key):
   1292     return self._getbool_axis(key, axis=axis)

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py:1324, in _LocIndexer._get_slice_axis(self, slice_obj, axis)
   1321     return obj.copy(deep=False)
   1323 labels = obj._get_axis(axis)
-> 1324 indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop, slice_obj.step)
   1326 if isinstance(indexer, slice):
   1327     return self.obj._slice(indexer, axis=axis)

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py:6602, in Index.slice_indexer(self, start, end, step, kind)
   6559 """
   6560 Compute the slice indexer for input labels and step.
   6561 
   (...)
   6598 slice(1, 3, None)
   6599 """
   6600 self._deprecated_arg(kind, "kind", "slice_indexer")
-> 6602 start_slice, end_slice = self.slice_locs(start, end, step=step)
   6604 # return a slice
   6605 if not is_scalar(start_slice):

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/multi.py:2775, in MultiIndex.slice_locs(self, start, end, step, kind)
   2772 self._deprecated_arg(kind, "kind", "slice_locs")
   2773 # This function adds nothing to its parent implementation (the magic
   2774 # happens in get_slice_bound method), but it adds meaningful doc.
-> 2775 return super().slice_locs(start, end, step)

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py:6810, in Index.slice_locs(self, start, end, step, kind)
   6808 start_slice = None
   6809 if start is not None:
-> 6810     start_slice = self.get_slice_bound(start, "left")
   6811 if start_slice is None:
   6812     start_slice = 0

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/multi.py:2714, in MultiIndex.get_slice_bound(self, label, side, kind)
   2712 if not isinstance(label, tuple):
   2713     label = (label,)
-> 2714 return self._partial_tup_index(label, side=side)

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/multi.py:2779, in MultiIndex._partial_tup_index(self, tup, side)
   2777 def _partial_tup_index(self, tup: tuple, side: Literal["left", "right"] = "left"):
   2778     if len(tup) > self._lexsort_depth:
-> 2779         raise UnsortedIndexError(
   2780             f"Key length ({len(tup)}) was greater than MultiIndex lexsort depth "
   2781             f"({self._lexsort_depth})"
   2782         )
   2784     n = len(tup)
   2785     start, end = 0, len(self)

UnsortedIndexError: 'Key length (2) was greater than MultiIndex lexsort depth (0)'
In [175]:
df2 = df.sort_index() # 必须先排序才可做slicing
In [176]:
df2.loc[('平安银行',pd.Timestamp('2015-01-05')):('平安银行', pd.Timestamp('2015-01-28'))]
Out[176]:
secID exchangeCD preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
secShortName tradeDate
平安银行 2015-01-05 000001.XSHE XSHE 1293.044 1307.737 4966040 4.565388e+09 92478.0 0.0291 1.575841e+11 1.830268e+11
2015-01-06 000001.XSHE XSHE 1307.737 1288.146 3761152 3.453446e+09 80325.0 0.0220 1.552233e+11 1.802848e+11
2015-01-07 000001.XSHE XSHE 1288.146 1263.656 2951601 2.634796e+09 72697.0 0.0173 1.522723e+11 1.768574e+11
2015-01-08 000001.XSHE XSHE 1263.656 1221.208 2443951 2.128003e+09 68734.0 0.0143 1.471572e+11 1.709164e+11
2015-01-09 000001.XSHE XSHE 1221.208 1231.004 4355039 3.835378e+09 99882.0 0.0255 1.483376e+11 1.722874e+11
2015-01-12 000001.XSHE XSHE 1231.004 1205.698 2696688 2.293105e+09 64376.0 0.0158 1.452882e+11 1.687457e+11
2015-01-13 000001.XSHE XSHE 1205.698 1198.351 1418187 1.204987e+09 34924.0 0.0083 1.444029e+11 1.677175e+11
2015-01-14 000001.XSHE XSHE 1198.351 1208.963 2192762 1.889297e+09 50675.0 0.0128 1.456817e+11 1.692027e+11
2015-01-15 000001.XSHE XSHE 1208.963 1253.044 2156548 1.868796e+09 54470.0 0.0126 1.509935e+11 1.753721e+11
2015-01-16 000001.XSHE XSHE 1253.044 1254.677 2701125 2.403346e+09 60637.0 0.0158 1.511903e+11 1.756006e+11
2015-01-19 000001.XSHE XSHE 1254.677 1128.964 3710288 3.016203e+09 103111.0 0.0217 1.360417e+11 1.580063e+11
2015-01-20 000001.XSHE XSHE 1128.964 1128.964 2588576 2.064281e+09 65826.0 0.0152 1.360417e+11 1.580063e+11
2015-01-21 000001.XSHE XSHE 1128.964 1177.127 3368980 2.758193e+09 78709.0 0.0197 1.418454e+11 1.647470e+11
2015-01-22 000001.XSHE XSHE 1177.127 1167.331 2178850 1.801436e+09 53599.0 0.0128 1.406650e+11 1.633760e+11
2015-01-23 000001.XSHE XSHE 1167.331 1175.494 2533304 2.108747e+09 53672.0 0.0148 1.416487e+11 1.645185e+11
2015-01-26 000001.XSHE XSHE 1175.494 1170.596 1836123 1.508447e+09 47025.0 0.0108 1.410585e+11 1.638330e+11
2015-01-27 000001.XSHE XSHE 1170.596 1142.025 2325514 1.881059e+09 61710.0 0.0136 1.376156e+11 1.598343e+11
2015-01-28 000001.XSHE XSHE 1142.025 1147.739 2154303 1.742176e+09 55381.0 0.0126 1.383042e+11 1.606340e+11
In [177]:
df2.loc[pd.IndexSlice[('宁德时代','2015-01'):('平安银行','2020-02')]]
Out[177]:
secID exchangeCD preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
secShortName tradeDate
宁德时代 2018-06-11 300750.XSHE XSHE 25.140 36.200 78829 2.845472e+06 154.0 0.0004 7.864222e+09 7.864222e+10
2018-06-12 300750.XSHE XSHE 36.200 39.820 26579 1.058376e+06 86.0 0.0001 8.650644e+09 8.650644e+10
2018-06-13 300750.XSHE XSHE 39.820 43.800 45030 1.972314e+06 98.0 0.0002 9.515274e+09 9.515274e+10
2018-06-14 300750.XSHE XSHE 43.800 48.180 74267 3.578184e+06 211.0 0.0003 1.046680e+10 1.046680e+11
2018-06-15 300750.XSHE XSHE 48.180 53.000 256510 1.359503e+07 717.0 0.0012 1.151392e+10 1.151392e+11
... ... ... ... ... ... ... ... ... ... ... ...
平安银行 2020-01-20 000001.XSHE XSHE 2055.130 2062.654 899493 1.226465e+09 42805.0 0.0038 3.192246e+11 3.192274e+11
2020-01-21 000001.XSHE XSHE 2062.654 2006.229 1080975 1.442171e+09 60130.0 0.0046 3.104920e+11 3.104947e+11
2020-01-22 000001.XSHE XSHE 2006.229 2017.514 867411 1.150933e+09 49245.0 0.0037 3.122386e+11 3.122412e+11
2020-01-23 000001.XSHE XSHE 2017.514 1948.549 1326911 1.723394e+09 71683.0 0.0057 3.015654e+11 3.015680e+11
2020-02-03 000001.XSHE XSHE 1948.549 1754.196 2723762 3.201454e+09 105935.0 0.0116 2.714865e+11 2.714888e+11

294154 rows × 10 columns

In [178]:
df.loc['平安银行'].loc['2015':'2016']
Out[178]:
secID exchangeCD preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
tradeDate
2015-01-05 000001.XSHE XSHE 1293.044 1307.737 4966040 4.565388e+09 92478.0 0.0291 1.575841e+11 1.830268e+11
2015-01-06 000001.XSHE XSHE 1307.737 1288.146 3761152 3.453446e+09 80325.0 0.0220 1.552233e+11 1.802848e+11
2015-01-07 000001.XSHE XSHE 1288.146 1263.656 2951601 2.634796e+09 72697.0 0.0173 1.522723e+11 1.768574e+11
2015-01-08 000001.XSHE XSHE 1263.656 1221.208 2443951 2.128003e+09 68734.0 0.0143 1.471572e+11 1.709164e+11
2015-01-09 000001.XSHE XSHE 1221.208 1231.004 4355039 3.835378e+09 99882.0 0.0255 1.483376e+11 1.722874e+11
... ... ... ... ... ... ... ... ... ... ...
2016-12-26 000001.XSHE XSHE 1092.827 1097.641 364173 2.739343e+08 11409.0 0.0021 1.334364e+11 1.565942e+11
2016-12-27 000001.XSHE XSHE 1097.641 1092.827 324124 2.447883e+08 9433.0 0.0018 1.328511e+11 1.559073e+11
2016-12-28 000001.XSHE XSHE 1092.827 1090.420 405159 3.048986e+08 10133.0 0.0023 1.325585e+11 1.555639e+11
2016-12-29 000001.XSHE XSHE 1090.420 1092.827 408419 3.071835e+08 9453.0 0.0023 1.328511e+11 1.559073e+11
2016-12-30 000001.XSHE XSHE 1092.827 1095.234 364834 2.748827e+08 10822.0 0.0021 1.331437e+11 1.562507e+11

488 rows × 10 columns

merge¶

In [179]:
df1 = pd.DataFrame({'a':[1,2,3],'b':[4,5,6],'c':[10,20,30]})
df2 = pd.DataFrame({'a':np.array([1,2,6]),
                    'b':[s for s in 'abc'],
                    'd':[s for s in 'zxz']})
In [180]:
df1
Out[180]:
a b c
0 1 4 10
1 2 5 20
2 3 6 30
In [181]:
df2
Out[181]:
a b d
0 1 a z
1 2 b x
2 6 c z
In [182]:
pd.merge(df1,df2,on='a')
Out[182]:
a b_x c b_y d
0 1 4 10 a z
1 2 5 20 b x
In [183]:
pd.merge(df1,df2,on='a',how='outer')
Out[183]:
a b_x c b_y d
0 1 4.0 10.0 a z
1 2 5.0 20.0 b x
2 3 6.0 30.0 NaN NaN
3 6 NaN NaN c z
In [184]:
pd.merge(df1,df2,on='a',how='left')
Out[184]:
a b_x c b_y d
0 1 4 10 a z
1 2 5 20 b x
2 3 6 30 NaN NaN
In [185]:
pd.merge(df1,df2,on='a',how='right')
Out[185]:
a b_x c b_y d
0 1 4.0 10.0 a z
1 2 5.0 20.0 b x
2 6 NaN NaN c z

Reshaping and pivoting¶

In [186]:
temp = tm.makeTimeDataFrame(3)
display(temp)
A B C D
2000-01-03 -0.522757 -1.607854 -0.000651 -0.264986
2000-01-04 -0.574578 -0.483190 -0.920019 0.332520
2000-01-05 1.490557 0.681432 -1.439317 -0.741802
In [187]:
pd.DataFrame(temp.stack()).reset_index()
Out[187]:
level_0 level_1 0
0 2000-01-03 A -0.522757
1 2000-01-03 B -1.607854
2 2000-01-03 C -0.000651
3 2000-01-03 D -0.264986
4 2000-01-04 A -0.574578
5 2000-01-04 B -0.483190
6 2000-01-04 C -0.920019
7 2000-01-04 D 0.332520
8 2000-01-05 A 1.490557
9 2000-01-05 B 0.681432
10 2000-01-05 C -1.439317
11 2000-01-05 D -0.741802
In [188]:
temp2 = pd.DataFrame(temp.stack()).reset_index()
temp2.columns=['date','variable','value']
In [189]:
temp2
Out[189]:
date variable value
0 2000-01-03 A -0.522757
1 2000-01-03 B -1.607854
2 2000-01-03 C -0.000651
3 2000-01-03 D -0.264986
4 2000-01-04 A -0.574578
5 2000-01-04 B -0.483190
6 2000-01-04 C -0.920019
7 2000-01-04 D 0.332520
8 2000-01-05 A 1.490557
9 2000-01-05 B 0.681432
10 2000-01-05 C -1.439317
11 2000-01-05 D -0.741802
In [190]:
def unpivot(frame):
    """
    Essentially the same as 'stack'
    """
    N, K = frame.shape
    data = {
        "value": frame.to_numpy().ravel("F"),
        "variable": np.asarray(frame.columns).repeat(N),
        "date": np.tile(np.asarray(frame.index), K),
    }
    return pd.DataFrame(data, columns=["date", "variable", "value"])
In [191]:
temp2 = unpivot(temp)
In [192]:
temp2
Out[192]:
date variable value
0 2000-01-03 A -0.522757
1 2000-01-04 A -0.574578
2 2000-01-05 A 1.490557
3 2000-01-03 B -1.607854
4 2000-01-04 B -0.483190
5 2000-01-05 B 0.681432
6 2000-01-03 C -0.000651
7 2000-01-04 C -0.920019
8 2000-01-05 C -1.439317
9 2000-01-03 D -0.264986
10 2000-01-04 D 0.332520
11 2000-01-05 D -0.741802
In [193]:
temp2.set_index(['date','variable'])
Out[193]:
value
date variable
2000-01-03 A -0.522757
2000-01-04 A -0.574578
2000-01-05 A 1.490557
2000-01-03 B -1.607854
2000-01-04 B -0.483190
2000-01-05 B 0.681432
2000-01-03 C -0.000651
2000-01-04 C -0.920019
2000-01-05 C -1.439317
2000-01-03 D -0.264986
2000-01-04 D 0.332520
2000-01-05 D -0.741802
In [194]:
temp2.set_index(['date','variable']).unstack()
Out[194]:
value
variable A B C D
date
2000-01-03 -0.522757 -1.607854 -0.000651 -0.264986
2000-01-04 -0.574578 -0.483190 -0.920019 0.332520
2000-01-05 1.490557 0.681432 -1.439317 -0.741802
In [195]:
temp2.pivot(index='date',columns='variable',values='value')
Out[195]:
variable A B C D
date
2000-01-03 -0.522757 -1.607854 -0.000651 -0.264986
2000-01-04 -0.574578 -0.483190 -0.920019 0.332520
2000-01-05 1.490557 0.681432 -1.439317 -0.741802
In [196]:
temp2['value2'] = temp2['value']**2
In [197]:
temp2
Out[197]:
date variable value value2
0 2000-01-03 A -0.522757 2.732746e-01
1 2000-01-04 A -0.574578 3.301401e-01
2 2000-01-05 A 1.490557 2.221761e+00
3 2000-01-03 B -1.607854 2.585194e+00
4 2000-01-04 B -0.483190 2.334727e-01
5 2000-01-05 B 0.681432 4.643496e-01
6 2000-01-03 C -0.000651 4.233486e-07
7 2000-01-04 C -0.920019 8.464347e-01
8 2000-01-05 C -1.439317 2.071633e+00
9 2000-01-03 D -0.264986 7.021733e-02
10 2000-01-04 D 0.332520 1.105695e-01
11 2000-01-05 D -0.741802 5.502701e-01
In [198]:
temp2.pivot(index='date',columns='variable')
Out[198]:
value value2
variable A B C D A B C D
date
2000-01-03 -0.522757 -1.607854 -0.000651 -0.264986 0.273275 2.585194 4.233486e-07 0.070217
2000-01-04 -0.574578 -0.483190 -0.920019 0.332520 0.330140 0.233473 8.464347e-01 0.110570
2000-01-05 1.490557 0.681432 -1.439317 -0.741802 2.221761 0.464350 2.071633e+00 0.550270
In [199]:
df.reset_index()
Out[199]:
secShortName tradeDate secID exchangeCD preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
0 平安银行 2015-01-05 000001.XSHE XSHE 1293.044 1307.737 4966040 4.565388e+09 92478.0 0.0291 1.575841e+11 1.830268e+11
1 平安银行 2015-01-06 000001.XSHE XSHE 1307.737 1288.146 3761152 3.453446e+09 80325.0 0.0220 1.552233e+11 1.802848e+11
2 平安银行 2015-01-07 000001.XSHE XSHE 1288.146 1263.656 2951601 2.634796e+09 72697.0 0.0173 1.522723e+11 1.768574e+11
3 平安银行 2015-01-08 000001.XSHE XSHE 1263.656 1221.208 2443951 2.128003e+09 68734.0 0.0143 1.471572e+11 1.709164e+11
4 平安银行 2015-01-09 000001.XSHE XSHE 1221.208 1231.004 4355039 3.835378e+09 99882.0 0.0255 1.483376e+11 1.722874e+11
... ... ... ... ... ... ... ... ... ... ... ... ...
5838745 凌云B股 2021-12-27 900957.XSHG XSHG 0.625 0.638 488511 3.081540e+05 226.0 0.0027 1.164720e+08 2.209170e+08
5838746 凌云B股 2021-12-28 900957.XSHG XSHG 0.638 0.637 177702 1.118810e+05 64.0 0.0010 1.162880e+08 2.205680e+08
5838747 凌云B股 2021-12-29 900957.XSHG XSHG 0.637 0.630 123550 7.733300e+04 58.0 0.0007 1.150000e+08 2.181250e+08
5838748 凌云B股 2021-12-30 900957.XSHG XSHG 0.630 0.635 113600 7.130800e+04 41.0 0.0006 1.159200e+08 2.198700e+08
5838749 凌云B股 2021-12-31 900957.XSHG XSHG 0.635 0.636 167800 1.059960e+05 74.0 0.0009 1.161040e+08 2.202190e+08

5838750 rows × 12 columns

In [200]:
df.reset_index(inplace=True)
In [202]:
df
Out[202]:
secShortName tradeDate secID exchangeCD preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
0 平安银行 2015-01-05 000001.XSHE XSHE 1293.044 1307.737 4966040 4.565388e+09 92478.0 0.0291 1.575841e+11 1.830268e+11
1 平安银行 2015-01-06 000001.XSHE XSHE 1307.737 1288.146 3761152 3.453446e+09 80325.0 0.0220 1.552233e+11 1.802848e+11
2 平安银行 2015-01-07 000001.XSHE XSHE 1288.146 1263.656 2951601 2.634796e+09 72697.0 0.0173 1.522723e+11 1.768574e+11
3 平安银行 2015-01-08 000001.XSHE XSHE 1263.656 1221.208 2443951 2.128003e+09 68734.0 0.0143 1.471572e+11 1.709164e+11
4 平安银行 2015-01-09 000001.XSHE XSHE 1221.208 1231.004 4355039 3.835378e+09 99882.0 0.0255 1.483376e+11 1.722874e+11
... ... ... ... ... ... ... ... ... ... ... ... ...
5838745 凌云B股 2021-12-27 900957.XSHG XSHG 0.625 0.638 488511 3.081540e+05 226.0 0.0027 1.164720e+08 2.209170e+08
5838746 凌云B股 2021-12-28 900957.XSHG XSHG 0.638 0.637 177702 1.118810e+05 64.0 0.0010 1.162880e+08 2.205680e+08
5838747 凌云B股 2021-12-29 900957.XSHG XSHG 0.637 0.630 123550 7.733300e+04 58.0 0.0007 1.150000e+08 2.181250e+08
5838748 凌云B股 2021-12-30 900957.XSHG XSHG 0.630 0.635 113600 7.130800e+04 41.0 0.0006 1.159200e+08 2.198700e+08
5838749 凌云B股 2021-12-31 900957.XSHG XSHG 0.635 0.636 167800 1.059960e+05 74.0 0.0009 1.161040e+08 2.202190e+08

5838750 rows × 12 columns

In [203]:
df.pivot(index='tradeDate',columns='secID',values='closePrice').iloc[:,0:4]
Out[203]:
secID 000001.XSHE 000002.XSHE 000004.XSHE 000005.XSHE
tradeDate
2015-01-05 1307.737 1681.312 107.225 NaN
2015-01-06 1288.146 1619.292 112.487 NaN
2015-01-07 1263.656 1604.632 112.145 NaN
2015-01-08 1221.208 1532.463 115.630 NaN
2015-01-09 1231.004 1516.676 112.282 NaN
... ... ... ... ...
2021-12-27 2213.298 2919.261 128.342 22.207
2021-12-28 2206.872 2928.188 133.399 22.502
2021-12-29 2152.889 2883.551 132.510 22.502
2021-12-30 2161.886 2844.866 133.945 22.502
2021-12-31 2118.185 2940.092 132.715 22.600

1705 rows × 4 columns

In [204]:
df.reset_index().pivot(index='tradeDate',columns='secID',values='closePrice').iloc[:,0:4].plot()
Out[204]:
<AxesSubplot: xlabel='tradeDate'>

Plotting¶

  • 画图的基础工具是 Matplotlib
  • Matplotlib 可以做精细的操作,但很多时候我们不想事必躬亲,或者我们想先快速地获得一个还不错的图,需要时再微调
  • pandas内置可以画图。更复杂的可以用Seaborn(构建于Matplotlib之上)
  • 其他常见的画图包:plotly, ggplot, bokeh, etc.

Matplotlib basics¶

In [205]:
import matplotlib as mpl
import matplotlib.pyplot as plt

axes: 执行画图的instance,子图

axis: x, y

为什么只画一个图的时候,也经常看到写fig, ax = plt.subplots()?

因为通常画图时候需要保存。

In [206]:
plt.subplots?
Signature:
plt.subplots(
    nrows=1,
    ncols=1,
    *,
    sharex=False,
    sharey=False,
    squeeze=True,
    width_ratios=None,
    height_ratios=None,
    subplot_kw=None,
    gridspec_kw=None,
    **fig_kw,
)
Docstring:
Create a figure and a set of subplots.

This utility wrapper makes it convenient to create common layouts of
subplots, including the enclosing figure object, in a single call.

Parameters
----------
nrows, ncols : int, default: 1
    Number of rows/columns of the subplot grid.

sharex, sharey : bool or {'none', 'all', 'row', 'col'}, default: False
    Controls sharing of properties among x (*sharex*) or y (*sharey*)
    axes:

    - True or 'all': x- or y-axis will be shared among all subplots.
    - False or 'none': each subplot x- or y-axis will be independent.
    - 'row': each subplot row will share an x- or y-axis.
    - 'col': each subplot column will share an x- or y-axis.

    When subplots have a shared x-axis along a column, only the x tick
    labels of the bottom subplot are created. Similarly, when subplots
    have a shared y-axis along a row, only the y tick labels of the first
    column subplot are created. To later turn other subplots' ticklabels
    on, use `~matplotlib.axes.Axes.tick_params`.

    When subplots have a shared axis that has units, calling
    `~matplotlib.axis.Axis.set_units` will update each axis with the
    new units.

squeeze : bool, default: True
    - If True, extra dimensions are squeezed out from the returned
      array of `~matplotlib.axes.Axes`:

      - if only one subplot is constructed (nrows=ncols=1), the
        resulting single Axes object is returned as a scalar.
      - for Nx1 or 1xM subplots, the returned object is a 1D numpy
        object array of Axes objects.
      - for NxM, subplots with N>1 and M>1 are returned as a 2D array.

    - If False, no squeezing at all is done: the returned Axes object is
      always a 2D array containing Axes instances, even if it ends up
      being 1x1.

width_ratios : array-like of length *ncols*, optional
    Defines the relative widths of the columns. Each column gets a
    relative width of ``width_ratios[i] / sum(width_ratios)``.
    If not given, all columns will have the same width.  Equivalent
    to ``gridspec_kw={'width_ratios': [...]}``.

height_ratios : array-like of length *nrows*, optional
    Defines the relative heights of the rows. Each row gets a
    relative height of ``height_ratios[i] / sum(height_ratios)``.
    If not given, all rows will have the same height. Convenience
    for ``gridspec_kw={'height_ratios': [...]}``.

subplot_kw : dict, optional
    Dict with keywords passed to the
    `~matplotlib.figure.Figure.add_subplot` call used to create each
    subplot.

gridspec_kw : dict, optional
    Dict with keywords passed to the `~matplotlib.gridspec.GridSpec`
    constructor used to create the grid the subplots are placed on.

**fig_kw
    All additional keyword arguments are passed to the
    `.pyplot.figure` call.

Returns
-------
fig : `.Figure`

ax : `~.axes.Axes` or array of Axes
    *ax* can be either a single `~.axes.Axes` object, or an array of Axes
    objects if more than one subplot was created.  The dimensions of the
    resulting array can be controlled with the squeeze keyword, see above.

    Typical idioms for handling the return value are::

        # using the variable ax for single a Axes
        fig, ax = plt.subplots()

        # using the variable axs for multiple Axes
        fig, axs = plt.subplots(2, 2)

        # using tuple unpacking for multiple Axes
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)

    The names ``ax`` and pluralized ``axs`` are preferred over ``axes``
    because for the latter it's not clear if it refers to a single
    `~.axes.Axes` instance or a collection of these.

See Also
--------
.pyplot.figure
.pyplot.subplot
.pyplot.axes
.Figure.subplots
.Figure.add_subplot

Examples
--------
::

    # First create some toy data:
    x = np.linspace(0, 2*np.pi, 400)
    y = np.sin(x**2)

    # Create just a figure and only one subplot
    fig, ax = plt.subplots()
    ax.plot(x, y)
    ax.set_title('Simple plot')

    # Create two subplots and unpack the output array immediately
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
    ax1.plot(x, y)
    ax1.set_title('Sharing Y axis')
    ax2.scatter(x, y)

    # Create four polar axes and access them through the returned array
    fig, axs = plt.subplots(2, 2, subplot_kw=dict(projection="polar"))
    axs[0, 0].plot(x, y)
    axs[1, 1].scatter(x, y)

    # Share a X axis with each column of subplots
    plt.subplots(2, 2, sharex='col')

    # Share a Y axis with each row of subplots
    plt.subplots(2, 2, sharey='row')

    # Share both X and Y axes with all subplots
    plt.subplots(2, 2, sharex='all', sharey='all')

    # Note that this is the same as
    plt.subplots(2, 2, sharex=True, sharey=True)

    # Create figure number 10 with a single subplot
    # and clears it if it already exists.
    fig, ax = plt.subplots(num=10, clear=True)
File:      ~/opt/anaconda3/lib/python3.9/site-packages/matplotlib/pyplot.py
Type:      function
In [207]:
fig = plt.figure()  # an empty figure with no Axes
fig, ax = plt.subplots()  # a figure with a single Axes
fig, axs = plt.subplots(2, 2)  # a figure with a 2x2 grid of Axes
<Figure size 640x480 with 0 Axes>
In [208]:
fig, ax = plt.subplots()
ax.plot([1, 2, 3, 4], [1, 4, 2, 3])
Out[208]:
[<matplotlib.lines.Line2D at 0x7f97eb2adfa0>]
In [209]:
temp2
Out[209]:
date variable value value2
0 2000-01-03 A -0.522757 2.732746e-01
1 2000-01-04 A -0.574578 3.301401e-01
2 2000-01-05 A 1.490557 2.221761e+00
3 2000-01-03 B -1.607854 2.585194e+00
4 2000-01-04 B -0.483190 2.334727e-01
5 2000-01-05 B 0.681432 4.643496e-01
6 2000-01-03 C -0.000651 4.233486e-07
7 2000-01-04 C -0.920019 8.464347e-01
8 2000-01-05 C -1.439317 2.071633e+00
9 2000-01-03 D -0.264986 7.021733e-02
10 2000-01-04 D 0.332520 1.105695e-01
11 2000-01-05 D -0.741802 5.502701e-01
In [210]:
fig, axs = plt.subplots(1,2)
x = range(temp2.shape[0])
y1 = temp2['value']
y2 = temp2['value']**2
axs[0].plot(x, y1)
axs[1].plot(x, y2)
axs[0].plot(x, y1**3)
Out[210]:
[<matplotlib.lines.Line2D at 0x7f97eb3938e0>]
In [211]:
fig
Out[211]:
In [212]:
x = pd.date_range(start='2021-01-03',periods=temp2.shape[0])
plt.plot(x,temp2['value'], label='random value')
plt.plot(x,temp2['value']**2, label='quadratic of random value')
plt.xlabel('date')
plt.ylabel('value')
plt.title('Example')
plt.legend()
Out[212]:
<matplotlib.legend.Legend at 0x7f97eb2fae50>

an Seaborn example with masked data¶

In [213]:
df_pingan.select_dtypes(np.number).corr()
Out[213]:
preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
preClosePrice 1.000000 0.997575 0.064982 0.389881 0.601451 -0.039942 0.989830 0.994295
closePrice 0.997575 1.000000 0.075458 0.400851 0.606522 -0.030710 0.991942 0.996452
turnoverVol 0.064982 0.075458 1.000000 0.924077 0.752066 0.981827 0.027619 0.055961
turnoverValue 0.389881 0.400851 0.924077 1.000000 0.914040 0.862471 0.355060 0.383088
dealAmount 0.601451 0.606522 0.752066 0.914040 1.000000 0.665757 0.573258 0.596717
turnoverRate -0.039942 -0.030710 0.981827 0.862471 0.665757 1.000000 -0.090386 -0.050985
negMarketValue 0.989830 0.991942 0.027619 0.355060 0.573258 -0.090386 1.000000 0.993615
marketValue 0.994295 0.996452 0.055961 0.383088 0.596717 -0.050985 0.993615 1.000000
In [214]:
corr = df_pingan.select_dtypes(np.number).corr()
In [215]:
corr
Out[215]:
preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
preClosePrice 1.000000 0.997575 0.064982 0.389881 0.601451 -0.039942 0.989830 0.994295
closePrice 0.997575 1.000000 0.075458 0.400851 0.606522 -0.030710 0.991942 0.996452
turnoverVol 0.064982 0.075458 1.000000 0.924077 0.752066 0.981827 0.027619 0.055961
turnoverValue 0.389881 0.400851 0.924077 1.000000 0.914040 0.862471 0.355060 0.383088
dealAmount 0.601451 0.606522 0.752066 0.914040 1.000000 0.665757 0.573258 0.596717
turnoverRate -0.039942 -0.030710 0.981827 0.862471 0.665757 1.000000 -0.090386 -0.050985
negMarketValue 0.989830 0.991942 0.027619 0.355060 0.573258 -0.090386 1.000000 0.993615
marketValue 0.994295 0.996452 0.055961 0.383088 0.596717 -0.050985 0.993615 1.000000
In [216]:
np.ones_like?
Signature: np.ones_like(a, dtype=None, order='K', subok=True, shape=None)
Docstring:
Return an array of ones with the same shape and type as a given array.

Parameters
----------
a : array_like
    The shape and data-type of `a` define these same attributes of
    the returned array.
dtype : data-type, optional
    Overrides the data type of the result.

    .. versionadded:: 1.6.0
order : {'C', 'F', 'A', or 'K'}, optional
    Overrides the memory layout of the result. 'C' means C-order,
    'F' means F-order, 'A' means 'F' if `a` is Fortran contiguous,
    'C' otherwise. 'K' means match the layout of `a` as closely
    as possible.

    .. versionadded:: 1.6.0
subok : bool, optional.
    If True, then the newly created array will use the sub-class
    type of `a`, otherwise it will be a base-class array. Defaults
    to True.
shape : int or sequence of ints, optional.
    Overrides the shape of the result. If order='K' and the number of
    dimensions is unchanged, will try to keep order, otherwise,
    order='C' is implied.

    .. versionadded:: 1.17.0

Returns
-------
out : ndarray
    Array of ones with the same shape and type as `a`.

See Also
--------
empty_like : Return an empty array with shape and type of input.
zeros_like : Return an array of zeros with shape and type of input.
full_like : Return a new array with shape of input filled with value.
ones : Return a new array setting values to one.

Examples
--------
>>> x = np.arange(6)
>>> x = x.reshape((2, 3))
>>> x
array([[0, 1, 2],
       [3, 4, 5]])
>>> np.ones_like(x)
array([[1, 1, 1],
       [1, 1, 1]])

>>> y = np.arange(3, dtype=float)
>>> y
array([0., 1., 2.])
>>> np.ones_like(y)
array([1.,  1.,  1.])
File:      ~/opt/anaconda3/lib/python3.9/site-packages/numpy/core/numeric.py
Type:      function
In [217]:
np.ones_like(corr, dtype=bool)
Out[217]:
array([[ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True]])
In [218]:
np.triu?
Signature: np.triu(m, k=0)
Docstring:
Upper triangle of an array.

Return a copy of an array with the elements below the `k`-th diagonal
zeroed. For arrays with ``ndim`` exceeding 2, `triu` will apply to the
final two axes.

Please refer to the documentation for `tril` for further details.

See Also
--------
tril : lower triangle of an array

Examples
--------
>>> np.triu([[1,2,3],[4,5,6],[7,8,9],[10,11,12]], -1)
array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 0,  8,  9],
       [ 0,  0, 12]])

>>> np.triu(np.arange(3*4*5).reshape(3, 4, 5))
array([[[ 0,  1,  2,  3,  4],
        [ 0,  6,  7,  8,  9],
        [ 0,  0, 12, 13, 14],
        [ 0,  0,  0, 18, 19]],
       [[20, 21, 22, 23, 24],
        [ 0, 26, 27, 28, 29],
        [ 0,  0, 32, 33, 34],
        [ 0,  0,  0, 38, 39]],
       [[40, 41, 42, 43, 44],
        [ 0, 46, 47, 48, 49],
        [ 0,  0, 52, 53, 54],
        [ 0,  0,  0, 58, 59]]])
File:      ~/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/twodim_base.py
Type:      function
In [219]:
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
In [220]:
mask
Out[220]:
array([[ True,  True,  True,  True,  True,  True,  True,  True],
       [False,  True,  True,  True,  True,  True,  True,  True],
       [False, False,  True,  True,  True,  True,  True,  True],
       [False, False, False,  True,  True,  True,  True,  True],
       [False, False, False, False,  True,  True,  True,  True],
       [False, False, False, False, False,  True,  True,  True],
       [False, False, False, False, False, False,  True,  True],
       [False, False, False, False, False, False, False,  True]])
In [221]:
import seaborn as sns
In [222]:
# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(18, 18))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, annot=True,
            square=True, linewidths=.5, annot_kws={"size": 10},cbar_kws={"shrink": .5})
Out[222]:
<AxesSubplot: >

Draw from pandas¶

In [223]:
df_pingan_num = df_pingan.select_dtypes(np.number)
In [224]:
df_pingan_num
Out[224]:
preClosePrice closePrice turnoverVol turnoverValue dealAmount turnoverRate negMarketValue marketValue
tradeDate
2015-01-05 1293.044 1307.737 4966040 4.565388e+09 92478.0 0.0291 1.575841e+11 1.830268e+11
2015-01-06 1307.737 1288.146 3761152 3.453446e+09 80325.0 0.0220 1.552233e+11 1.802848e+11
2015-01-07 1288.146 1263.656 2951601 2.634796e+09 72697.0 0.0173 1.522723e+11 1.768574e+11
2015-01-08 1263.656 1221.208 2443951 2.128003e+09 68734.0 0.0143 1.471572e+11 1.709164e+11
2015-01-09 1221.208 1231.004 4355039 3.835378e+09 99882.0 0.0255 1.483376e+11 1.722874e+11
... ... ... ... ... ... ... ... ...
2021-12-27 2224.866 2213.298 881462 1.260455e+09 66911.0 0.0038 3.341671e+11 3.341699e+11
2021-12-28 2213.298 2206.872 1358314 1.934461e+09 89671.0 0.0058 3.331968e+11 3.331996e+11
2021-12-29 2206.872 2152.889 1771527 2.480535e+09 147243.0 0.0076 3.250464e+11 3.250491e+11
2021-12-30 2152.889 2161.886 960485 1.342374e+09 66792.0 0.0041 3.264048e+11 3.264075e+11
2021-12-31 2161.886 2118.185 2110777 2.899617e+09 152423.0 0.0090 3.198068e+11 3.198095e+11

1705 rows × 8 columns

In [225]:
df_pingan_num.plot()
Out[225]:
<AxesSubplot: xlabel='tradeDate'>
In [226]:
plt.rcParams['figure.figsize'] = (16.0, 9.0)
In [227]:
df_pingan_num.drop(['negMarketValue','marketValue'],axis=1).plot()
Out[227]:
<AxesSubplot: xlabel='tradeDate'>
In [228]:
def standardization(col):
    return (col - col.mean()) / col.std()
In [229]:
df_pingan_num.apply(standardization).plot()
Out[229]:
<AxesSubplot: xlabel='tradeDate'>
In [230]:
df_pingan_num.apply(standardization)[['preClosePrice','turnoverValue','marketValue']].plot(style=['bs-', 'ro-', 'y^-'])
Out[230]:
<AxesSubplot: xlabel='tradeDate'>
In [231]:
df_pingan_num.plot(subplots=True)
Out[231]:
array([<AxesSubplot: xlabel='tradeDate'>,
       <AxesSubplot: xlabel='tradeDate'>,
       <AxesSubplot: xlabel='tradeDate'>,
       <AxesSubplot: xlabel='tradeDate'>,
       <AxesSubplot: xlabel='tradeDate'>,
       <AxesSubplot: xlabel='tradeDate'>,
       <AxesSubplot: xlabel='tradeDate'>,
       <AxesSubplot: xlabel='tradeDate'>], dtype=object)