当前位置：网站首页>Numpy cheatsheet

Numpy cheatsheet

2022-07-24 05:51:00 【Ml -- xiaoxiaobai】

1. ndarray object internals

import numpy as np

np.ones((3, 4, 5), dtype=np.float64).strides

(160, 40, 8)

The strides tuple gives the number of bytes to step in order to advance one element along each axis. You can see that each np.float64 element is 8 bytes = 64 bits long. Generally, the larger the stride on an axis, the more costly it is to perform computation along that axis.

np.ones((3, 4, 5), dtype=np.float32).strides

(80, 20, 4)

np.ones((3, 4, 5), dtype=np.uint16).strides

(40, 10, 2)

np.ones((3, 4, 5), dtype=np.float32).strides

(80, 20, 4)

np.ones((3, 4, 5), dtype=np.integer).strides

(160, 40, 8)

1.1 Numpy dtype hierarchy

ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)

ints.dtype

dtype('uint16')

np.issubdtype(ints.dtype, np.integer)

True

np.issubdtype(ints.dtype, np.floating)

False

np.issubdtype(floats.dtype, np.floating)

True

np.issubdtype('float', np.floating)

True

#  Look at the parent class 
np.float64.mro()

[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

np.uint16.mro()

[numpy.uint16,
 numpy.unsignedinteger,
 numpy.integer,
 numpy.number,
 numpy.generic,
 object]

np.issubdtype(np.float64, float)

True

np.issubdtype(np.float32, float)

False

np.float32.mro()

[numpy.float32,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 object]

2. Advanced array manipulation

2.1 Reshape the array

arr = np.arange(8)
arr

array([0, 1, 2, 3, 4, 5, 6, 7])

# C order: reshape row by row (row-major)
# F order: reshape column by column (column-major)
arr.reshape((4, 2))

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

arr.reshape((4, 2), order='F')

array([[0, 4],
       [1, 5],
       [2, 6],
       [3, 7]])

arr.reshape((-1, 4))

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

#  Flattening: flatten() always returns a copy of the data
#  Raveling: ravel() does not copy when it can avoid it (it returns a view), so writing to its result changes the original array
arr = np.arange(15).reshape((5, 3))
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

arr.flatten()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

arr.flatten()[2]=99
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

arr.ravel()[2] = 99

arr

array([[ 0,  1, 99],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

arr.flatten(order='F')

array([ 0,  3,  6,  9, 12,  1,  4,  7, 10, 13, 99,  5,  8, 11, 14])

2.2 C order and Fortran order

# C order: traverse starting from the axis at the end of the array's shape, i.e. the higher-index axes are traversed first (vary fastest)
# F order: traverse starting from the axis at the beginning of the array's shape, i.e. the lower-index axes are traversed first (vary fastest)
arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

arr.ravel(order='f')

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

2.3 Join and split arrays

arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])

# axis Specify which axis to splice 
np.concatenate([arr1, arr2], axis=0)

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

np.concatenate([arr1, arr2], axis=1)

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

arr3 = np.array([1, 2, 3])
arr4 = np.array([4, 5, 6])

np.column_stack((arr3, arr4)) #  1D arrays are automatically converted to 2D columns first

array([[1, 4],
       [2, 5],
       [3, 6]])

np.column_stack((arr3[None, :], arr4[None, :]))

array([[1, 2, 3, 4, 5, 6]])

np.hstack((arr3, arr4)) #  Does not convert 1D arrays to 2D

array([1, 2, 3, 4, 5, 6])

np.row_stack((arr3, arr4))

array([[1, 2, 3],
       [4, 5, 6]])

np.vstack((arr3, arr4))

array([[1, 2, 3],
       [4, 5, 6]])

# split  cutting / Split array 
arr = np.random.randn(5, 4)
print(arr)
first, second, third = np.split(arr, [1, 3], axis=1)
first, second, third

[[ 0.84435272 -1.41113975  0.15211615 -1.09964343]
 [-0.05692753  1.54980138  0.84249968 -0.2415958 ]
 [ 1.28897645 -0.00376694 -0.09085957  0.38944626]
 [ 1.08790741  0.13027303 -0.82255024 -0.42911225]
 [-1.70847678 -0.84645461 -0.2277294  -0.61552024]]





(array([[ 0.84435272],
        [-0.05692753],
        [ 1.28897645],
        [ 1.08790741],
        [-1.70847678]]),
 array([[-1.41113975,  0.15211615],
        [ 1.54980138,  0.84249968],
        [-0.00376694, -0.09085957],
        [ 0.13027303, -0.82255024],
        [-0.84645461, -0.2277294 ]]),
 array([[-1.09964343],
        [-0.2415958 ],
        [ 0.38944626],
        [-0.42911225],
        [-0.61552024]]))

first, second, third = np.hsplit(arr, [1, 3])
first, second, third

(array([[ 0.84435272],
        [-0.05692753],
        [ 1.28897645],
        [ 1.08790741],
        [-1.70847678]]),
 array([[-1.41113975,  0.15211615],
        [ 1.54980138,  0.84249968],
        [-0.00376694, -0.09085957],
        [ 0.13027303, -0.82255024],
        [-0.84645461, -0.2277294 ]]),
 array([[-1.09964343],
        [-0.2415958 ],
        [ 0.38944626],
        [-0.42911225],
        [-0.61552024]]))

first, second, third = np.vsplit(arr, [1, 3])
first, second, third

(array([[ 0.84435272, -1.41113975,  0.15211615, -1.09964343]]),
 array([[-0.05692753,  1.54980138,  0.84249968, -0.2415958 ],
        [ 1.28897645, -0.00376694, -0.09085957,  0.38944626]]),
 array([[ 1.08790741,  0.13027303, -0.82255024, -0.42911225],
        [-1.70847678, -0.84645461, -0.2277294 , -0.61552024]]))

2.3.1 Stacking helpers: r_ and c_

arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr2 = np.random.randn(3, 2)

np.row_stack([arr1, arr2])

array([[ 0.        ,  1.        ],
       [ 2.        ,  3.        ],
       [ 4.        ,  5.        ],
       [ 0.70333255,  0.26275106],
       [ 1.99202938, -1.46490714],
       [ 0.73384258,  0.43515298]])

np.r_[arr1, arr2]

array([[ 0.        ,  1.        ],
       [ 2.        ,  3.        ],
       [ 4.        ,  5.        ],
       [ 0.70333255,  0.26275106],
       [ 1.99202938, -1.46490714],
       [ 0.73384258,  0.43515298]])

np.column_stack([np.vstack([arr1, arr2]), arr])

array([[ 0.        ,  1.        ,  0.        ],
       [ 2.        ,  3.        ,  1.        ],
       [ 4.        ,  5.        ,  2.        ],
       [ 0.70333255,  0.26275106,  3.        ],
       [ 1.99202938, -1.46490714,  4.        ],
       [ 0.73384258,  0.43515298,  5.        ]])

np.c_[np.r_[arr1, arr2], arr]

array([[ 0.        ,  1.        ,  0.        ],
       [ 2.        ,  3.        ,  1.        ],
       [ 4.        ,  5.        ,  2.        ],
       [ 0.70333255,  0.26275106,  3.        ],
       [ 1.99202938, -1.46490714,  4.        ],
       [ 0.73384258,  0.43515298,  5.        ]])

np.column_stack([1:6, -10:-5])

  File "<ipython-input-81-d3ac66c3723b>", line 1
    np.column_stack([1:6, -10:-5])
                      ^
SyntaxError: invalid syntax

np.c_[1:6, -10:-5]

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

2.4 Repeating elements: tile and repeat

# repeat  Elements repeat 
arr = np.arange(3)
arr

array([0, 1, 2])

arr.repeat(3)

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

arr.repeat([2, 1, 4])

array([0, 0, 1, 2, 2, 2, 2])

arr = np.random.randn(2, 3)
arr

array([[ 0.16201186, -0.76919668, -0.71688664],
       [-0.70057032, -0.96810267, -1.61212582]])

arr.repeat([2, 1, 4, 2, 2, 2]) #  Will flatten

array([ 0.16201186,  0.16201186, -0.76919668, -0.71688664, -0.71688664,
       -0.71688664, -0.71688664, -0.70057032, -0.70057032, -0.96810267,
       -0.96810267, -1.61212582, -1.61212582])

arr = np.random.randn(2, 2)
arr

array([[ 1.28555077,  0.33821435],
       [-0.51810958, -1.69789573]])

arr.repeat(2) #  If the axis is not specified , It will cause flattening

array([ 1.28555077,  1.28555077,  0.33821435,  0.33821435, -0.51810958,
       -0.51810958, -1.69789573, -1.69789573])

arr.repeat(2, axis=0)

array([[ 1.28555077,  0.33821435],
       [ 1.28555077,  0.33821435],
       [-0.51810958, -1.69789573],
       [-0.51810958, -1.69789573]])

arr.repeat(2, axis=1)

array([[ 1.28555077,  1.28555077,  0.33821435,  0.33821435],
       [-0.51810958, -0.51810958, -1.69789573, -1.69789573]])

arr.repeat([2, 1], axis=0)

array([[ 1.28555077,  0.33821435],
       [ 1.28555077,  0.33821435],
       [-0.51810958, -1.69789573]])

arr.repeat([2, 3], axis=1)

array([[ 1.28555077,  1.28555077,  0.33821435,  0.33821435,  0.33821435],
       [-0.51810958, -0.51810958, -1.69789573, -1.69789573, -1.69789573]])

# tile repeats the array as a whole (it stacks copies of the entire array)
arr

array([[ 1.28555077,  0.33821435],
       [-0.51810958, -1.69789573]])

np.tile(arr, 2)

array([[ 1.28555077,  0.33821435,  1.28555077,  0.33821435],
       [-0.51810958, -1.69789573, -0.51810958, -1.69789573]])

np.tile(arr, [2, 1])

array([[ 1.28555077,  0.33821435],
       [-0.51810958, -1.69789573],
       [ 1.28555077,  0.33821435],
       [-0.51810958, -1.69789573]])

np.tile(arr, (3, 2))

array([[ 1.28555077,  0.33821435,  1.28555077,  0.33821435],
       [-0.51810958, -1.69789573, -0.51810958, -1.69789573],
       [ 1.28555077,  0.33821435,  1.28555077,  0.33821435],
       [-0.51810958, -1.69789573, -0.51810958, -1.69789573],
       [ 1.28555077,  0.33821435,  1.28555077,  0.33821435],
       [-0.51810958, -1.69789573, -0.51810958, -1.69789573]])

2.5 Fancy-indexing equivalents: take and put

arr = np.arange(10) * 100
arr

array([  0, 100, 200, 300, 400, 500, 600, 700, 800, 900])

inds = [7, 1, 2, 6]
arr[inds]

array([700, 100, 200, 600])

arr.take(inds)

array([700, 100, 200, 600])

arr.put(inds, 22)

arr

array([  0,  22,  22, 300, 400, 500,  22,  22, 800, 900])

arr.put(inds, [1, 2, 3, 4])
arr

array([  0,   2,   3, 300, 400, 500,   4,   1, 800, 900])

arr = np.random.randn(2, 4)
inds = [2, 0, 2, 1]
arr

array([[-0.49565355,  0.78522712, -0.06629777, -1.00791514],
       [ 0.39132436, -0.52828662, -0.82480479,  0.49250005]])

arr.take(inds, axis=1)

array([[-0.06629777, -0.49565355, -0.06629777,  0.78522712],
       [-0.82480479,  0.39132436, -0.82480479, -0.52828662]])

Note: put does not accept an axis parameter.

3. Broadcasting

First of all, an operation between an array and a scalar is actually a broadcast followed by element-wise arithmetic.

import numpy as np

arr = np.arange(5)
arr

array([0, 1, 2, 3, 4])

arr * 4

array([ 0,  4,  8, 12, 16])

The broadcasting rule: two arrays are compatible if, for each trailing dimension (i.e. starting from the end of the shape), the axis lengths match or one of them is 1. Broadcasting is then performed over the missing or length-1 axes. For example, in (4, 3) + (3,) the trailing axis lengths match, so the second array is broadcast along the missing axis; in (4, 3) + (1, 3), the length-1 axis is broadcast, so axis 0 goes from 1 to 4. Operations between an array and a scalar also use broadcasting: in (4, 3) + scalar, the scalar's shape can be thought of as (1,), so its trailing axis has length 1; during broadcasting that trailing axis is broadcast to 3 and the missing axis is broadcast to 4.

Broadcasting can be performed on both arrays at once, e.g. (4, 4) + (4, 1, 4): first, the trailing axis lengths match; second, of the two mismatched axis lengths one is 1, so that 1 is broadcast to 4; finally, the missing axis is broadcast to 4.

Given this rule, sometimes we want to compute (4, 3) + (4, 1) but the second operand actually has shape (4,). Because its trailing axis length is not 1, and 3 and 4 do not match, it cannot be broadcast. We must add an axis first, using reshape, [:, None] indexing, or np.newaxis.

So the key is the trailing axis lengths of the two arrays: if at first glance they are neither 1 nor equal, do not count on broadcasting, and consider writing an explicit loop instead.

arr = np.random.randn(4, 3)
arr.mean(0)

array([ 0.27783846,  0.36009253, -0.1499029 ])

demeaned = arr - arr.mean(0)
demeaned

array([[-0.79969385, -1.6011334 , -0.00747013],
       [-0.0381061 ,  0.64865496, -0.97992594],
       [ 1.13694786,  0.81091045,  0.73967573],
       [-0.29914791,  0.14156799,  0.24772034]])

arr.shape, arr.mean(0).shape

((4, 3), (3,))

aaa = np.array([1])
aaa.shape

(1,)

ans = arr - aaa
arr - ans

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

arr.shape, aaa.shape

((4, 3), (1,))

arr.shape

(4, 3)

arr.mean(1).shape

(4,)

arr - arr.mean(1)

---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-18-8b8ada26fac0> in <module>
----> 1 arr - arr.mean(1)


ValueError: operands could not be broadcast together with shapes (4,3) (4,)

arr - arr.mean(1).reshape(-1, 1)

array([[ 0.11823437, -0.6009511 ,  0.48271673],
       [ 0.20018202,  0.96919716, -1.16937918],
       [ 0.35626561,  0.11248227, -0.46874788],
       [-0.21403229,  0.30893769, -0.0949054 ]])

arr - arr.mean(1)[:, None]

array([[ 0.11823437, -0.6009511 ,  0.48271673],
       [ 0.20018202,  0.96919716, -1.16937918],
       [ 0.35626561,  0.11248227, -0.46874788],
       [-0.21403229,  0.30893769, -0.0949054 ]])

arr - arr.mean(1)[:, np.newaxis]

array([[ 0.11823437, -0.6009511 ,  0.48271673],
       [ 0.20018202,  0.96919716, -1.16937918],
       [ 0.35626561,  0.11248227, -0.46874788],
       [-0.21403229,  0.30893769, -0.0949054 ]])

A three-dimensional example ：

arr = np.ones((4, 4))
arr_3d = arr[:, np.newaxis, :]
arr.shape, arr_3d.shape

((4, 4), (4, 1, 4))

arr + arr_3d

array([[[2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.]],

       [[2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.]],

       [[2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.]],

       [[2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.]]])

A common pattern is to subtract (or divide out) an aggregate such as the sum, variance, or mean along some axis:

arr = np.random.randn(3, 4, 5)
#  Suppose we want to subtract the mean along axis 1
means = arr.mean(1)
means

array([[-0.95808939, -0.59395877,  0.44605451,  0.06325242,  0.14369531],
       [ 0.2600657 , -0.92595688, -0.75528343, -0.2486933 , -0.02936524],
       [-0.22052564,  0.14549496, -0.67660057, -0.10151047,  0.26275483]])

arr.shape, means.shape

((3, 4, 5), (3, 5))

demeaned = arr - means[:, np.newaxis, :]

demeaned.mean(1) < 1e-16

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

#  It can be written as a function 
def demean_axis(arr, axis=0):
    """Subtract the mean along ``axis`` from ``arr`` using broadcasting.

    Args:
        arr: NumPy array to center.
        axis: Axis along which the mean is computed and removed.

    Returns:
        A new array with the same shape as ``arr`` whose mean along
        ``axis`` is (numerically) zero.
    """
    means = arr.mean(axis)
    # Build an index that keeps every axis of ``means`` and re-inserts the
    # reduced axis as a length-1 axis, so ``means`` broadcasts against ``arr``.
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    # Index with a tuple: using a plain list for multidimensional indexing is
    # deprecated and is interpreted as an array index by newer NumPy versions.
    return arr - means[tuple(indexer)]

arr = np.random.randn(3, 4, 5)
demeaned = demean_axis(arr, axis=1)
demeaned.mean(1) < 1e-16

<ipython-input-45-8051ed80feee>:6: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return arr - means[(indexer)]





array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In fact, assigning values through array indexing also uses broadcasting:

arr = np.zeros((4, 3))
col = np.array([1.28, 0, 33, 0.5])
arr[:] = col[:, np.newaxis]
arr

array([[ 1.28,  1.28,  1.28],
       [ 0.  ,  0.  ,  0.  ],
       [33.  , 33.  , 33.  ],
       [ 0.5 ,  0.5 ,  0.5 ]])

arr[:2] = [[2], [3]]
arr

array([[ 2. ,  2. ,  2. ],
       [ 3. ,  3. ,  3. ],
       [33. , 33. , 33. ],
       [ 0.5,  0.5,  0.5]])

4. Advanced ufunc usage

4.1 ufunc instance methods

These are somewhat similar to functions such as agg in pandas.

# reduce: takes an array and aggregates its values (optionally along an axis) by repeatedly applying a binary ufunc such as add or multiply

arr = np.arange(10)
np.add.reduce(arr)

arr.sum()

arr = np.arange(1, 5)
np.multiply.reduce(arr)

np.prod(arr)

arr.prod()

np.random.seed(12346)
arr = np.random.randn(5, 5)
arr[::2]

array([[-8.99822478e-02,  7.59372617e-01,  7.48336101e-01,
        -9.81497953e-01,  3.65775545e-01],
       [ 2.48256116e-01, -3.21536673e-01, -8.48730755e-01,
         4.60468309e-04, -5.46459347e-01],
       [-6.49092950e-01, -4.79535727e-01, -9.53521432e-01,
         1.42253882e+00,  1.75403128e-01]])

arr[::2].sort(1)

arr[::2]

array([[-9.81497953e-01, -8.99822478e-02,  3.65775545e-01,
         7.48336101e-01,  7.59372617e-01],
       [-8.48730755e-01, -5.46459347e-01, -3.21536673e-01,
         4.60468309e-04,  2.48256116e-01],
       [-9.53521432e-01, -6.49092950e-01, -4.79535727e-01,
         1.75403128e-01,  1.42253882e+00]])

arr

array([[-9.81497953e-01, -8.99822478e-02,  3.65775545e-01,
         7.48336101e-01,  7.59372617e-01],
       [-3.15442628e-01, -8.66135605e-01,  2.78568155e-02,
        -4.55597723e-01, -1.60189223e+00],
       [-8.48730755e-01, -5.46459347e-01, -3.21536673e-01,
         4.60468309e-04,  2.48256116e-01],
       [ 2.53915229e-01,  1.93684246e+00, -7.99504902e-01,
        -5.69159281e-01,  4.89244731e-02],
       [-9.53521432e-01, -6.49092950e-01, -4.79535727e-01,
         1.75403128e-01,  1.42253882e+00]])

np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)

array([ True, False,  True, False,  True])

np.logical_and.accumulate(arr[:, :-1] < arr[:, 1:], axis=1)

array([[ True,  True,  True,  True],
       [False, False, False, False],
       [ True,  True,  True,  True],
       [ True, False, False, False],
       [ True,  True,  True,  True]])

arr[:, :-1] < arr[:, 1:]

array([[ True,  True,  True,  True],
       [False,  True, False, False],
       [ True,  True,  True,  True],
       [ True, False,  True,  True],
       [ True,  True,  True,  True]])

np.all(arr[:, :-1] < arr[:, 1:], axis=1)

array([ True, False,  True, False,  True])

np.logical_and.reduce(arr[:, [0]] < arr[:, [1]], axis=1)

array([ True, False,  True,  True,  True])

arr[:, [0]] < arr[:, [1]]

array([[ True],
       [False],
       [ True],
       [ True],
       [ True]])

#  Just as cumsum is related to sum, accumulate is the counterpart of reduce
arr = np.arange(15).reshape((3, 5))
np.add.accumulate(arr, axis=1)

array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]])

# outer performs a pairwise operation between two arrays, like an outer product, except that the ufunc is applied instead of multiplication
arr = np.arange(3).repeat([1, 2, 2])
arr

array([0, 1, 1, 2, 2])

np.multiply.outer(arr, np.arange(5))

array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])

np.outer(arr, np.arange(5))

array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])

np.logical_and.outer(arr, np.arange(5))

array([[False, False, False, False, False],
       [False,  True,  True,  True,  True],
       [False,  True,  True,  True,  True],
       [False,  True,  True,  True,  True],
       [False,  True,  True,  True,  True]])

# The shape of outer's result is the concatenation of the two input arrays' shapes
x, y = np.random.randn(3, 4), np.random.randn(5)
result = np.subtract.outer(x, y)

result.shape

(3, 4, 5)

# reduceat performs a local reduce, essentially a groupby-style aggregation over contiguous slices of the array
arr = np.arange(10)
np.add.reduceat(arr, [0, 5, 8])

array([10, 18, 17])

arr = np.multiply.outer(np.arange(4), np.arange(5))
arr

array([[ 0,  0,  0,  0,  0],
       [ 0,  1,  2,  3,  4],
       [ 0,  2,  4,  6,  8],
       [ 0,  3,  6,  9, 12]])

np.add.reduceat(arr, [0, 2, 4], axis=1)

array([[ 0,  0,  0],
       [ 1,  5,  4],
       [ 2, 10,  8],
       [ 3, 15, 12]])

4.2 Writing new ufuncs in Python

#  utilize  numpy.vectorize
def add_elements(x, y):
    """Return the result of ``x + y`` for a single pair of operands."""
    total = x + y
    return total

add_them = np.vectorize(add_elements, otypes=[np.float64])
add_them(np.arange(8), np.arange(8))

array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14.])

add_them(np.random.randn(2, 2), np.random.randn(2, 2))

array([[-1.33080793, -1.43407981],
       [ 0.15584993,  1.0519004 ]])

arr = np.random.randn(10000)
%timeit add_them(arr, arr)

1.18 ms ± 15.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

arr = np.random.randn(10000)
%timeit np.add(arr, arr)

2.53 µs ± 43.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

5. Structured and record arrays

import numpy as np

#  Specify the dtype as a list of (field_name, field_data_type) tuples
dtype = [('x', np.float64), ('y', np.int32)]
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr

array([(1.5       ,  6), (3.14159265, -2)],
      dtype=[('x', '<f8'), ('y', '<i4')])

sarr[0]

(1.5, 6)

sarr['x']

array([1.5       , 3.14159265])

sarr['y']

array([ 6, -2], dtype=int32)

sarr['x'].dtype.name

'float64'

sarr.dtype.name

'void96'

sarr.dtype.names

('x', 'y')

sarr.dtype

dtype([('x', '<f8'), ('y', '<i4')])

sarr[0]['y']

5.1 Nested dtypes and multidimensional fields

#  A field in the dtype can also be given a shape (an int or a tuple), which specifies how many elements that field holds
dtype = [('x', np.int64, 3), ('y', np.int32)]

arr = np.zeros(4, dtype=dtype)
arr

array([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)],
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])

np.array([1, 2, 3, 4], dtype=dtype)

array([([1, 1, 1], 1), ([2, 2, 2], 2), ([3, 3, 3], 3), ([4, 4, 4], 4)],
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])

arr[0]

([0, 0, 0], 0)

arr[0]['x']

array([0, 0, 0])

arr['x']

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]
data = np.array([((1, 2), 3), ((3, 4), 5)], dtype=dtype)
data

array([((1., 2.), 3), ((3., 4.), 5)],
      dtype=[('x', [('a', '<f8'), ('b', '<f4')]), ('y', '<i4')])

data['x']

array([(1., 2.), (3., 4.)], dtype=[('a', '<f8'), ('b', '<f4')])

data['y']

array([3, 5], dtype=int32)

data['x']['a']

array([1., 3.])

6. More about sorting

import numpy as np

# The ndarray sort instance method is similar to Python's built-in list sort: it sorts in place instead of producing a new array
arr = np.random.randn(6)
arr.sort()
arr

array([-1.20245647, -1.06934741,  0.23694375,  0.46847888,  1.33116886,
        2.07072179])

arr = np.random.randn(3, 5)
arr

array([[-0.1373051 ,  0.8347231 , -2.13610283,  0.6911535 ,  1.29073812],
       [-0.89613231,  0.40151617,  1.19168597, -0.3273313 ,  1.15674067],
       [-0.59111152, -0.13488416,  0.13590381,  0.07592941, -0.92518222]])

arr[:, 0].sort()

arr

array([[-0.89613231,  0.8347231 , -2.13610283,  0.6911535 ,  1.29073812],
       [-0.59111152,  0.40151617,  1.19168597, -0.3273313 ,  1.15674067],
       [-0.1373051 , -0.13488416,  0.13590381,  0.07592941, -0.92518222]])

#  Unlike the instance method, numpy.sort produces a new, sorted copy
arr = np.random.randn(5)
arr

array([-2.1607857 , -0.98139601, -1.74567649, -0.93574966,  0.53958451])

np.sort(arr)

array([-2.1607857 , -1.74567649, -0.98139601, -0.93574966,  0.53958451])

arr

array([-2.1607857 , -0.98139601, -1.74567649, -0.93574966,  0.53958451])

#  You can specify the axis 
arr = np.random.randn(3, 5)
arr

array([[-2.9614586 ,  0.43584634, -0.27190644,  0.03146461,  0.21746412],
       [ 0.4819753 , -1.1517702 ,  1.59403466, -0.51082439,  0.05183487],
       [ 0.6184096 , -1.34489717,  0.05997099,  1.23059888,  1.84840695]])

arr.sort(axis=1)

arr

array([[-2.9614586 , -0.27190644,  0.03146461,  0.21746412,  0.43584634],
       [-1.1517702 , -0.51082439,  0.05183487,  0.4819753 ,  1.59403466],
       [-1.34489717,  0.05997099,  0.6184096 ,  1.23059888,  1.84840695]])

#  Reverse order method 
arr[:, ::-1]

array([[ 0.43584634,  0.21746412,  0.03146461, -0.27190644, -2.9614586 ],
       [ 1.59403466,  0.4819753 ,  0.05183487, -0.51082439, -1.1517702 ],
       [ 1.84840695,  1.23059888,  0.6184096 ,  0.05997099, -1.34489717]])

6.1 Indirect sort ： argsort and lexsort

values = np.array([5, 0, 1, 3, 2])
indexer = values.argsort()
indexer

array([1, 2, 4, 3, 0])

values[indexer]

array([0, 1, 2, 3, 5])

arr = np.random.randn(3, 5)
arr[0] = values

arr

array([[ 5.        ,  0.        ,  1.        ,  3.        ,  2.        ],
       [-1.27871978,  1.65385215,  1.04044587,  0.89253023,  0.12713788],
       [ 1.58960486,  1.06406754,  0.06449551,  0.17571087,  1.35782749]])

arr[:, indexer]

array([[ 0.        ,  1.        ,  2.        ,  3.        ,  5.        ],
       [ 1.65385215,  1.04044587,  0.12713788,  0.89253023, -1.27871978],
       [ 1.06406754,  0.06449551,  1.35782749,  0.17571087,  1.58960486]])

arr[:, arr[0].argsort()]

array([[ 0.        ,  1.        ,  2.        ,  3.        ,  5.        ],
       [ 1.65385215,  1.04044587,  0.12713788,  0.89253023, -1.27871978],
       [ 1.06406754,  0.06449551,  1.35782749,  0.17571087,  1.58960486]])

lexsort is similar to argsort, but it performs an indirect lexicographical sort on multiple key arrays.

first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
sorter = np.lexsort((first_name, last_name)) #  Keys are applied from last to first: last_name is the primary sort key and first_name breaks ties
sorter

array([1, 2, 3, 0, 4])

zip(last_name[sorter], first_name[sorter])

<zip at 0x7fd37101f1c0>

[x for x in zip(last_name[sorter], first_name[sorter])]

[('Arnold', 'Jane'),
 ('Arnold', 'Steve'),
 ('Jones', 'Bill'),
 ('Jones', 'Bob'),
 ('Walters', 'Barbara')]

6.2 Other sorting algorithms

values = np.array(['2:first', '2:second', '1:first', '1:second', '1:third'])
key = np.array([2, 2, 1, 1, 1])
indexer = key.argsort(kind='mergesort')
indexer

array([2, 3, 4, 0, 1])

values.take(indexer)

array(['1:first', '1:second', '1:third', '2:first', '2:second'],
      dtype='<U8')

indexer = key.argsort(kind='quicksort')
indexer

array([2, 3, 4, 0, 1])

indexer = key.argsort(kind='heapsort')
indexer

array([4, 2, 3, 1, 0])

values.take(indexer)

array(['1:third', '1:first', '1:second', '2:second', '2:first'],
      dtype='<U8')

6.3 Partial sort of array

# numpy.partition(arr, kth, axis): the element that would be at index kth in sorted order is placed there; all smaller elements are moved before it and all larger ones after it (in no guaranteed order)
np.random.seed(12345)
arr = np.random.randn(20)
arr

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057,
        1.39340583,  0.09290788,  0.28174615,  0.76902257,  1.24643474,
        1.00718936, -1.29622111,  0.27499163,  0.22891288,  1.35291684,
        0.88642934, -2.00163731, -0.37184254,  1.66902531, -0.43856974])

np.partition(arr, 3)

array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.37184254,
       -0.43856974, -0.20470766,  0.28174615,  0.76902257,  0.47894334,
        1.00718936,  0.09290788,  0.27499163,  0.22891288,  1.35291684,
        0.88642934,  1.39340583,  1.96578057,  1.66902531,  1.24643474])

arr

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057,
        1.39340583,  0.09290788,  0.28174615,  0.76902257,  1.24643474,
        1.00718936, -1.29622111,  0.27499163,  0.22891288,  1.35291684,
        0.88642934, -2.00163731, -0.37184254,  1.66902531, -0.43856974])

np.sort(arr)

array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.43856974,
       -0.37184254, -0.20470766,  0.09290788,  0.22891288,  0.27499163,
        0.28174615,  0.47894334,  0.76902257,  0.88642934,  1.00718936,
        1.24643474,  1.35291684,  1.39340583,  1.66902531,  1.96578057])

np.partition(arr, 10)

array([-1.29622111, -0.43856974, -0.51943872, -0.5557303 , -0.37184254,
       -2.00163731, -0.20470766,  0.09290788,  0.22891288,  0.27499163,
        0.28174615,  1.00718936,  0.47894334,  0.76902257,  1.35291684,
        0.88642934,  1.39340583,  1.96578057,  1.66902531,  1.24643474])

arr

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057,
        1.39340583,  0.09290788,  0.28174615,  0.76902257,  1.24643474,
        1.00718936, -1.29622111,  0.27499163,  0.22891288,  1.35291684,
        0.88642934, -2.00163731, -0.37184254,  1.66902531, -0.43856974])

indices = np.argpartition(arr, 3) #  Index 
indices

array([16, 11,  3,  2, 17, 19,  0,  7,  8,  1, 10,  6, 12, 13, 14, 15,  5,
        4, 18,  9])

arr.take(indices)

array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.37184254,
       -0.43856974, -0.20470766,  0.28174615,  0.76902257,  0.47894334,
        1.00718936,  0.09290788,  0.27499163,  0.22891288,  1.35291684,
        0.88642934,  1.39340583,  1.96578057,  1.66902531,  1.24643474])

arr.take(indices) == np.partition(arr, 3)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

6.4 numpy.searchsorted： Look for elements in the sorted array

arr = np.array([0, 8, 11, 8, 15])
arr.searchsorted(9)

arr = np.array([0, 1, 7, 12, 15])

arr.searchsorted(9)

arr.searchsorted([0, 8, 11, 16])

array([0, 3, 3, 5])

arr = np.array([0, 0, 0, 1, 1, 1, 1])
arr.searchsorted([0, 1])

array([0, 3])

arr.searchsorted([0, 1], side='right')

array([3, 7])

#  A typical application 
data = np.floor(np.random.uniform(0, 10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
data

array([9940., 6768., 7908., 1709.,  268., 8003., 9037.,  246., 4917.,
       5262., 5963.,  519., 8950., 7282., 8183., 5002., 8101.,  959.,
       2189., 2587., 4681., 4593., 7095., 1780., 5314., 1677., 7688.,
       9281., 6094., 1501., 4896., 3773., 8486., 9110., 3838., 3154.,
       5683., 1878., 1258., 6875., 7996., 5735., 9732., 6340., 8884.,
       4954., 3516., 7142., 5039., 2256.])

labels = bins.searchsorted(data)

labels

array([4, 4, 4, 3, 2, 4, 4, 2, 3, 4, 4, 2, 4, 4, 4, 4, 4, 2, 3, 3, 3, 3,
       4, 3, 4, 3, 4, 4, 4, 3, 3, 3, 4, 4, 3, 3, 4, 3, 3, 4, 4, 4, 4, 4,
       4, 3, 3, 4, 4, 3])

#  Combined with pandas groupby, these labels can be used to bin the data
import pandas as pd

pd.Series(data).groupby(labels).mean()

2     498.000000
3    3064.277778
4    7389.035714
dtype: float64

7. Writing fast NumPy functions with Numba

import numpy as np

#  Now consider a pure-Python function and see how slow it is
def mean_distance(x, y):
    """Return the mean of the element-wise differences ``x[i] - y[i]``.

    Only the first ``len(x)`` positions of ``y`` are read. The explicit
    index loop is kept on purpose: this is the pure-Python baseline that is
    timed against the vectorized version.
    """
    total = 0.0
    n_items = 0
    for idx in range(len(x)):
        total += x[idx] - y[idx]
        n_items += 1
    return total / n_items

x = np.random.randn(10000000)
y = np.random.randn(10000000)
%timeit mean_distance(x, y)

2.68 s ± 100 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

%timeit (x - y).mean()

26.9 ms ± 549 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

#  Use numba.jit to compile the function above into a Numba function:
import numba as nb
numba_mean_distance = nb.jit(mean_distance)

#  Equivalent method 
@nb.jit
def numba_mean_distance2(x, y):
    """Return the mean of ``x[i] - y[i]`` over the first ``len(x)`` positions.

    Same loop as ``mean_distance``, but decorated with ``nb.jit``.
    """
    total = 0.0
    n_items = 0
    for idx in range(len(x)):
        total += x[idx] - y[idx]
        n_items += 1
    return total / n_items

%timeit numba_mean_distance(x, y) #  Even faster than the vectorized NumPy version

14.6 ms ± 153 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Note that Numba cannot compile arbitrary Python code to machine code, but it does support a significant subset of pure Python.

%timeit numba_mean_distance2(x, y)

16.3 ms ± 911 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Numba replaces calls to functions it does not know how to compile with CPython API calls. Numba's jit function has an option, nopython=True, which restricts the allowed code to Python code that can be compiled to LLVM without any calls into the Python C API. jit(nopython=True) has a shorter alias, numba.njit.

from numba import float64, njit

@njit(float64(float64[:], float64[:]))
def njit_mean_distance(x, y):
    """Return the mean of ``x - y`` using array expressions.

    The decorator declares an explicit signature: two one-dimensional
    float64 arrays in, one float64 out.
    """
    diff = x - y
    return diff.mean()

%timeit njit_mean_distance(x, y)

36.9 ms ± 174 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

7.1 Creating custom numpy.ufunc objects with Numba

from numba import vectorize

@vectorize
def nb_add(x, y):
    """Return ``x + y``; ``@vectorize`` wraps this scalar kernel."""
    total = x + y
    return total

x = np.arange(10)
nb_add(x, x)

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

nb_add.accumulate(x, axis=0) #  i.e. this builds a compiled NumPy ufunc that behaves just like the built-in ones

array([ 0,  1,  3,  6, 10, 15, 21, 28, 36, 45])

8. Advanced array input and output

8.1 Memory mapped files

A memory-mapped file is a way of interacting with binary data on disk as if it were stored in an in-memory array. It allows large files to be read and written in small segments, without loading the whole array into memory.

#  To create a memory map, call np.memmap with a file path, dtype, shape, and file mode:
mmap = np.memmap('mymap', dtype='float64', mode='w+', shape=(10000, 10000))
mmap

memmap([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

# Slicing a memmap returns a view of the data on disk. Data assigned to a slice is buffered in memory (like a Python file object) until you call flush to write it to disk.
section = mmap[:5]

section.shape

(5, 10000)

section[:] = np.random.randn(5, 10000)
section

memmap([[-1.4264226 ,  0.21729148,  1.60461715, ...,  0.3102347 ,
          0.17720547,  1.69646377],
        [-1.20953714, -2.7361618 , -0.23058431, ...,  0.33713541,
          0.67793013,  0.60138858],
        [ 0.71859367, -0.34768919,  1.33271115, ...,  0.32399778,
          1.03741373, -0.65384645],
        [-0.6104341 ,  0.64413784,  0.42810329, ...,  1.25154416,
          0.34818979,  0.80809682],
        [ 0.21072709,  0.09675299, -0.10433349, ...,  1.22574256,
         -0.20164288,  0.46595202]])

mmap

memmap([[-1.4264226 ,  0.21729148,  1.60461715, ...,  0.3102347 ,
          0.17720547,  1.69646377],
        [-1.20953714, -2.7361618 , -0.23058431, ...,  0.33713541,
          0.67793013,  0.60138858],
        [ 0.71859367, -0.34768919,  1.33271115, ...,  0.32399778,
          1.03741373, -0.65384645],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])

mmap.flush()

mmap

memmap([[-1.4264226 ,  0.21729148,  1.60461715, ...,  0.3102347 ,
          0.17720547,  1.69646377],
        [-1.20953714, -2.7361618 , -0.23058431, ...,  0.33713541,
          0.67793013,  0.60138858],
        [ 0.71859367, -0.34768919,  1.33271115, ...,  0.32399778,
          1.03741373, -0.65384645],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])

del mmap #  Delete the in-memory object; the file on disk is kept

mmap

---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

<ipython-input-35-49f7bdb78b33> in <module>
----> 1 mmap


NameError: name 'mmap' is not defined

When a memory map falls out of scope and is garbage-collected, any changes are flushed to disk. Because the file on disk holds only raw binary data, with no metadata describing it, you must specify dtype and shape again when reopening it. The dtype may also be a structured or nested dtype.

mmap = np.memmap('mymap', dtype='float64', shape=(10000, 10000))
mmap

memmap([[-1.4264226 ,  0.21729148,  1.60461715, ...,  0.3102347 ,
          0.17720547,  1.69646377],
        [-1.20953714, -2.7361618 , -0.23058431, ...,  0.33713541,
          0.67793013,  0.60138858],
        [ 0.71859367, -0.34768919,  1.33271115, ...,  0.32399778,
          1.03741373, -0.65384645],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])

8.2 HDF5 And other array storage options

HDF stands for Hierarchical Data Format; HDF5 is an array storage format that supports compression.

9. Performance tips

Convert Python loops and conditional logic to array operations and boolean array operations
Use broadcasting whenever possible
Use array views (slicing) to avoid copying data
Use ufuncs and ufunc methods

9.1 The importance of contiguous memory

By the principle of locality, accessing an array in its native memory order (C or Fortran) avoids strided access and gives a higher cache hit rate. You can check an array's memory order through the ndarray's flags attribute.

arr_c = np.ones((5000, 5000), order='C')
arr_f = np.ones((5000, 5000), order='F')
arr_c.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

arr_f.flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

arr_f.flags.f_contiguous

True

arr_f.flags.c_contiguous

False

arr_c.flags.c_contiguous

True

%timeit arr_c.sum(axis=1)

13.1 ms ± 628 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

%timeit arr_c.sum(axis=0)

13.7 ms ± 1.35 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

%timeit arr_f.sum(axis=0)

11.6 ms ± 935 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

%timeit arr_f.sum(axis=1)

15.2 ms ± 1.59 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

#  To get an array in the desired memory order, use a transpose, or copy with the order given
arr_f.copy('C').flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

#  A view built on an array is not guaranteed to be contiguous
arr_c[:50].flags.contiguous

True

arr_c[:50].flags.c_contiguous

True

arr_c[:50].flags.f_contiguous

False

arr_c[:, :50].flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

arr_c[:, :50].flags.contiguous

False

原网站

版权声明
本文为[Ml -- xiaoxiaobai]所创，转载请带上原文链接，感谢
https://yzsam.com/2022/205/202207240516576222.html