numpy数组按某一维度相加_Python数据分析之NumPy(高级篇)

  • Post author:
  • Post category:python

96b381ea02ee48d2994d37c720a65216

一些更高级的ndarray处理

where和一些其他的逻辑运算

np.where(cond,x,y):满足条件(cond)输出x,不满足输出y

x_arr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
y_arr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])
print(np.where(cond, x_arr, y_arr))
[ 1.1  2.2  1.3  1.4  2.5]
arr = np.random.randn(4,4)
print(arr)
print(np.where(arr > 0, 2, -2))
print(np.where(arr > 0, 2, arr))
[[ -1.10484247e+00  -3.82422727e-01  -3.24361549e-01   1.21286234e+00] [  1.54499855e-01  -4.77728163e-04   1.44621074e+00  -2.64241611e-03] [  1.36394862e+00   6.96638259e-02  -2.75237740e-01  -3.32892881e-01] [ -1.37165175e+00   1.79997993e-01  -1.13509664e-01   1.88373639e+00]][[-2 -2 -2  2] [ 2 -2  2 -2] [ 2  2 -2 -2] [-2  2 -2  2]][[ -1.10484247e+00  -3.82422727e-01  -3.24361549e-01   2.00000000e+00] [  2.00000000e+00  -4.77728163e-04   2.00000000e+00  -2.64241611e-03] [  2.00000000e+00   2.00000000e+00  -2.75237740e-01  -3.32892881e-01] [ -1.37165175e+00   2.00000000e+00  -1.13509664e-01   2.00000000e+00]]

np.where可以嵌套使用

cond_1 = np.array([True, False, True, True, False])
cond_2 = np.array([False, True, False, True, False])
result = np.where(cond_1 & cond_2, 0,
                  np.where(cond_1, 1, np.where(cond_2, 2, 3)))
print(result)
[1 2 1 0 3]
arr = np.random.randn(10)
print(arr)
print((arr > 0).sum()) # 统计数组中大于0的元素个数(布尔数组求和时True计为1)
[ 0.27350655 -1.51093462  0.26835915 -0.45991855  1.34450904 -1.86871203  0.04308971  1.69640444 -0.02191351 -0.43875275]
5
bools = np.array([False, False, True, False])
print(bools.any()) # 有一个为True则返回True
print(bools.all()) # 有一个为False则返回False
True
False

reshape(数组变形)

numpy可以很容易地把一维数组转成二维数组,三维数组。

import numpy as np
arr = np.arange(8)
print("(4,2):", arr.reshape((4,2)))
print()
print("(2,2,2):", arr.reshape((2,2,2)))
(4,2): [[0 1] [2 3] [4 5] [6 7]](2,2,2): [[[0 1]  [2 3]] [[4 5]  [6 7]]]

-1( 维度自动推算)

如果我们在某一个维度上写上-1,numpy会帮我们自动推导出正确的维度

arr = np.arange(15)
print(arr.reshape((5,-1)))
print(arr.reshape((5,-1)).shape)
[[ 0  1  2] [ 3  4  5] [ 6  7  8] [ 9 10 11] [12 13 14]](5, 3)

ravel(拉平数组)

# 高维数组用ravel来拉平成为一维数组
arr = np.arange(15)
print(arr.ravel())
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]

concatenate(连接数组)

arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
print(np.concatenate([arr1, arr2], axis = 0))  # 按行连接
print(np.concatenate([arr1, arr2], axis = 1))  # 按列连接
[[ 1  2  3] [ 4  5  6] [ 7  8  9] [10 11 12]][[ 1  2  3  7  8  9] [ 4  5  6 10 11 12]]

连接的另一种表述垂直stack与水平stack

print(np.vstack((arr1, arr2))) # 垂直堆叠
print(np.hstack((arr1, arr2))) # 水平堆叠
[[ 1  2  3] [ 4  5  6] [ 7  8  9] [10 11 12]][[ 1  2  3  7  8  9] [ 4  5  6 10 11 12]]

split(拆分数组)

arr = np.random.rand(5,5)
print(arr)
[[ 0.08218151  0.25291976  0.990262    0.74980044  0.92433676] [ 0.57215647  0.88759783  0.67939949  0.18618301  0.64810013] [ 0.21424794  0.5812622   0.33170632  0.40780156  0.00946797] [ 0.46223634  0.53574553  0.25289433  0.33226224  0.26110024] [ 0.81823359  0.98863697  0.13713923  0.3520669   0.38301044]]
first, second, third = np.split(arr, [1,3], axis = 0) # 按行拆分
print(first)
print()
print(second)
print()
print(third)
[[ 0.08218151  0.25291976  0.990262    0.74980044  0.92433676]][[ 0.57215647  0.88759783  0.67939949  0.18618301  0.64810013] [ 0.21424794  0.5812622   0.33170632  0.40780156  0.00946797]][[ 0.46223634  0.53574553  0.25289433  0.33226224  0.26110024] [ 0.81823359  0.98863697  0.13713923  0.3520669   0.38301044]]
first, second, third = np.split(arr, [1, 3], axis = 1) # 按列拆分
print(first)
print()
print(second)
print()
print(third)
[[ 0.08218151] [ 0.57215647] [ 0.21424794] [ 0.46223634] [ 0.81823359]][[ 0.25291976  0.990262  ] [ 0.88759783  0.67939949] [ 0.5812622   0.33170632] [ 0.53574553  0.25289433] [ 0.98863697  0.13713923]][[ 0.74980044  0.92433676] [ 0.18618301  0.64810013] [ 0.40780156  0.00946797] [ 0.33226224  0.26110024] [ 0.3520669   0.38301044]]

堆叠辅助

arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr2 = np.random.randn(3, 2)
#r_用于按行堆叠
print(np.r_[arr1, arr2])
print()
#c_用于按列堆叠
print(np.c_[np.r_[arr1, arr2], arr])
print()
#切片直接转为数组
print(np.c_[1:6, -10:-5])
print()
[[ 0.          1.        ] [ 2.          3.        ] [ 4.          5.        ] [ 0.04811148 -1.93674347] [ 1.19646481  0.17346639] [-1.4388562  -1.41584843]][[ 0.          1.          0.        ] [ 2.          3.          1.        ] [ 4.          5.          2.        ] [ 0.04811148 -1.93674347  3.        ] [ 1.19646481  0.17346639  4.        ] [-1.4388562  -1.41584843  5.        ]][[  1 -10] [  2  -9] [  3  -8] [  4  -7] [  5  -6]]

repeat(数组重复)

repeat(a,repeats, axis=None)

  • 按元素重复
arr = np.arange(3)
print(arr.repeat(3))
print(arr.repeat([2,3,4]))
print()
[0 0 0 1 1 1 2 2 2][0 0 1 1 1 2 2 2 2]
  • 指定axis来重复
arr = np.random.rand(2, 2)
print(arr)
[[ 0.468845    0.43227877] [ 0.13822954  0.14501615]]
print(arr.repeat(2, axis=0))
print(arr.repeat(2, axis=1))
[[ 0.468845    0.43227877] [ 0.468845    0.43227877] [ 0.13822954  0.14501615] [ 0.13822954  0.14501615]][[ 0.468845    0.468845    0.43227877  0.43227877] [ 0.13822954  0.13822954  0.14501615  0.14501615]]

tile(按规则重复数组)

tile通过重复给定的次数来构造数组。tile(A, reps):初始数组是A,重复规则是reps。reps可以是一个整数或元组,表示数组A沿各个维度重复的次数:例如reps=2表示沿最后一个维度(列方向)重复2次,reps=(2,3)表示沿行方向重复2次、沿列方向重复3次。

arr = np.arange(4).reshape((2, 2))
print(np.tile(arr, 2))
print(np.tile(arr, (2,3)))
[[0 1 0 1] [2 3 2 3]][[0 1 0 1 0 1] [2 3 2 3 2 3] [0 1 0 1 0 1] [2 3 2 3 2 3]]

numpy的文件输入输出

读取csv文件作为数组

import numpy as np
arr = np.loadtxt('array_ex.txt', delimiter=',')
print(arr)
[[ 0.580052  0.18673   1.040717  1.134411] [ 0.194163 -0.636917 -0.938659  0.124094] [-0.12641   0.268607 -0.695724  0.047428] [-1.484413  0.004176 -0.744203  0.005487] [ 2.302869  0.200131  1.670238 -1.88109 ] [-0.19323   1.047233  0.482803  0.960334]]

数组文件读写

arr = np.arange(10)
np.save('some_array', arr)
print(np.load('some_array.npy'))
[0 1 2 3 4 5 6 7 8 9]

多个数组可以一起打包存储到一个.npz文件中(np.savez本身不做压缩;如需压缩请使用np.savez_compressed)

arr2 = np.arange(15).reshape(3,5)
np.savez('array_archive.npz', a=arr, b=arr2)
arch = np.load('array_archive.npz')
print(arch['a'])
print(arch['b'])
[0 1 2 3 4 5 6 7 8 9][[ 0  1  2  3  4] [ 5  6  7  8  9] [10 11 12 13 14]]

用numpy写一个softmax

步骤:

  • 数据预处理
  • 计算exponential
  • 每行求和
  • 每一行除以计算的和
import numpy as np
# 产生(10,10)随机数
m = np.random.rand(10, 10) * 10 + 1000
print(m)
[[ 1002.4195769   1000.59428635  1004.19947044  1009.17641327   1004.89329928  1001.02496808  1007.79619575  1005.61568017   1009.28511386  1000.11608716] [ 1002.9870141   1005.59523328  1001.99337934  1008.79319814   1004.78921679  1003.91814186  1009.38777432  1005.20436416   1009.27099589  1008.69823987] [ 1006.68713949  1009.02893339  1008.2656608   1002.27620211  1009.2256124   1004.14144532  1007.09728075  1006.21626467  1004.60860132   1004.51547132] [ 1005.57757481  1001.6026775   1004.79229078  1004.28025577   1008.68219699  1005.6379599   1008.07958879  1006.35060616   1009.03418483  1003.50279599] [ 1003.22924339  1006.62272977  1008.5591972   1009.72498967   1004.49414198  1004.21450523  1008.32652935  1000.90418303   1009.24606203  1001.27113066] [ 1006.84865072  1005.24619541  1000.04356362  1003.38870582   1008.59759772  1008.80052236  1007.92905671  1006.16987466  1002.3761379   1001.55941284] [ 1006.80724007  1004.46597582  1003.25453387  1008.55713243   1009.19618236  1002.06897172  1004.69874948  1006.51535711   1005.23735087  1006.85265988] [ 1002.22993628  1000.59475018  1007.52711923  1000.36311206   1008.22254861  1003.94553055  1004.23517969  1005.26438502   1006.39421888  1005.22133756] [ 1006.92863693  1003.23688304  1007.11513614  1003.28880837   1009.11093137  1006.35136574  1002.04684923  1001.13114541   1008.50487627  1008.67481458] [ 1002.65347387  1001.90472796  1004.02149562  1009.63548587   1009.16220671  1006.39781332  1008.1526219   1003.57220839   1008.60930803  1004.41645034]]

直接对m进行e指数运算会产生上溢

print(np.exp(m))
[[ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]
 [ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]
 [ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]
 [ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]
 [ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]
 [ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]
 [ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]
 [ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]
 [ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]
 [ inf  inf  inf  inf  inf  inf  inf  inf  inf  inf]]
G:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: RuntimeWarning: overflow encountered in exp
  """Entry point for launching an IPython kernel.

寻找每一行的最大值

# 沿axis=1(列方向)取最大值,即取每一行的最大值
m_row_max = m.max(axis=1).reshape(10,1)
print(m_row_max, m_row_max.shape)
[[ 1009.28511386] [ 1009.38777432] [ 1009.2256124 ] [ 1009.03418483] [ 1009.72498967] [ 1008.80052236] [ 1009.19618236] [ 1008.22254861] [ 1009.11093137] [ 1009.63548587]] (10, 1)

通过广播的方式将每行数据减去对应行的最大值

# 采用广播的方式进行减法操作
m = m - m_row_max
print(m)
[[-6.86553696 -8.69082751 -5.08564343 -0.1087006  -4.39181458 -8.26014579  -1.48891811 -3.66943369  0.         -9.16902671] [-6.40076022 -3.79254104 -7.39439498 -0.59457618 -4.59855753 -5.46963247   0.         -4.18341016 -0.11677843 -0.68953445] [-2.5384729  -0.19667901 -0.95995159 -6.94941029  0.         -5.08416708  -2.12833165 -3.00934773 -4.61701107 -4.71014107] [-3.45661002 -7.43150733 -4.24189405 -4.75392907 -0.35198784 -3.39622493  -0.95459604 -2.68357867  0.         -5.53138884] [-6.49574628 -3.1022599  -1.16579247  0.         -5.23084769 -5.51048445  -1.39846033 -8.82080664 -0.47892764 -8.45385902] [-1.95187164 -3.55432696 -8.75695874 -5.41181655 -0.20292464  0.  -0.87146565 -2.63064771 -6.42438446 -7.24110952] [-2.3889423  -4.73020655 -5.94164849 -0.63904993  0.         -7.12721064  -4.49743288 -2.68082526 -3.95883149 -2.34352249] [-5.99261232 -7.62779843 -0.69542937 -7.85943655  0.         -4.27701805  -3.98736891 -2.95816359 -1.82832972 -3.00121104] [-2.18229443 -5.87404833 -1.99579523 -5.82212299  0.         -2.75956563  -7.06408214 -7.97978595 -0.6060551  -0.43611679] [-6.982012   -7.73075791 -5.61399025  0.         -0.47327916 -3.23767255  -1.48286397 -6.06327748 -1.02617783 -5.21903553]]

求预处理后的e指数

#求预处理后的e指数
m_exp = np.exp(m)
print(m_exp, m_exp.shape)
[[  1.04312218e-03   1.68120847e-04   6.18490628e-03   8.96998943e-01    1.23782475e-02   2.58621284e-04   2.25616615e-01   2.54909015e-02    1.00000000e+00   1.04217895e-04] [  1.66029460e-03   2.25382585e-02   6.14688467e-04   5.51796380e-01    1.00663457e-02   4.21278021e-03   1.00000000e+00   1.52464260e-02    8.89782323e-01   5.01809632e-01] [  7.89869284e-02   8.21454272e-01   3.82911421e-01   9.59200640e-04    1.00000000e+00   6.19404411e-03   1.19035722e-01   4.93238409e-02    9.88228942e-03   9.00350735e-03] [  3.15364890e-02   5.92294057e-04   1.43803289e-02   8.61776882e-03    7.03288672e-01   3.34994945e-02   3.84967625e-01   6.83182276e-02    1.00000000e+00   3.96048477e-03] [  1.50984802e-03   4.49475108e-02   3.11675571e-01   1.00000000e+00    5.34898908e-03   4.04414773e-03   2.46976935e-01   1.47629228e-04    6.19447308e-01   2.13076561e-04] [  1.42008035e-01   2.86006179e-02   1.57362462e-04   4.46352464e-03    8.16339758e-01   1.00000000e+00   4.18337963e-01   7.20317916e-02    1.62153108e-03   7.16516327e-04] [  9.17266523e-02   8.82464816e-03   2.62769434e-03   5.27793627e-01    1.00000000e+00   8.02955997e-04   1.11375513e-02   6.85065952e-02    1.90854027e-02   9.59889224e-02] [  2.49713221e-03   4.86731255e-04   4.98860204e-01   3.86091355e-04    1.00000000e+00   1.38840018e-02   1.85484526e-02   5.19141655e-02    1.60681727e-01   4.97268106e-02] [  1.12782462e-01   2.81146852e-03   1.35905535e-01   2.96131163e-03    1.00000000e+00   6.33192663e-02   8.55279590e-04   3.42312686e-04    5.45498570e-01   6.46542214e-01] [  9.28433319e-04   4.39111184e-04   3.64648989e-03   1.00000000e+00    6.22956140e-01   3.92551533e-02   2.26986674e-01   2.32676246e-03    3.58374111e-01   5.41254683e-03]] (10, 10)

将求指数后的数据沿axis=1求和(即每行求和),然后将一维数据(10,)reshape成(10,1)

m_exp_row_sum = m_exp.sum(axis = 1).reshape(10,1)
print(m_exp_row_sum, m_exp_row_sum.shape)
[[ 2.1682437 ] [ 2.99772713] [ 2.47775123] [ 2.24916138] [ 2.23431102] [ 2.4842771 ] [ 1.82649405] [ 1.79698532] [ 2.51101842] [ 2.26032542]] (10, 1)

每行的数据除以对应行e指数求和

m_softmax = m_exp / m_exp_row_sum
print(m_softmax)
[[  4.81090841e-04   7.75378004e-05   2.85249591e-03   4.13698398e-01    5.70888203e-03   1.19276853e-04   1.04055008e-01   1.17564744e-02    4.61202771e-01   4.80655820e-05] [  5.53851145e-04   7.51844898e-03   2.05051507e-04   1.84071584e-01    3.35799265e-03   1.40532478e-03   3.33586066e-01   5.08599528e-03    2.96818985e-01   1.67396701e-01] [  3.18784741e-02   3.31532183e-01   1.54539898e-01   3.87125483e-04    4.03591769e-01   2.49986522e-03   4.80418376e-02   1.99066962e-02    3.98841067e-03   3.63374146e-03] [  1.40214434e-02   2.63339955e-04   6.39364033e-03   3.83154756e-03    3.12689288e-01   1.48942156e-02   1.71160517e-01   3.03749780e-02    4.44610159e-01   1.76087176e-03] [  6.75755530e-04   2.01169445e-02   1.39495159e-01   4.47565264e-01    2.39402171e-03   1.81002005e-03   1.10538297e-01   6.60737144e-05    2.77243098e-01   9.53656673e-05] [  5.71627193e-02   1.15126521e-02   6.33433613e-05   1.79670965e-03    3.28602537e-01   4.02531586e-01   1.68394243e-01   2.89950713e-02    6.52717479e-04   2.88420453e-04] [  5.02200663e-02   4.83146833e-03   1.43865475e-03   2.88965424e-01    5.47496993e-01   4.39615994e-04   6.09777585e-03   3.75071549e-02    1.04492006e-02   5.25536464e-02] [  1.38962304e-03   2.70859896e-04   2.77609505e-01   2.14855041e-04    5.56487574e-01   7.72627449e-03   1.03219834e-02   2.88895880e-02    8.94173844e-02   2.76723522e-02] [  4.49150276e-02   1.11965269e-03   5.41236712e-02   1.17932692e-03    3.98244789e-01   2.52165679e-02   3.40610640e-04   1.36324243e-04    2.17241963e-01   2.57482067e-01] [  4.10752058e-04   1.94269011e-04   1.61325881e-03   4.42414172e-01    2.75604625e-01   1.73670361e-02   1.00422121e-01   1.02939269e-03    1.58549786e-01   2.39458743e-03]]

验证一下,对输出值沿axis=1求和(即每行求和),每行结果应该均为1

print(m_softmax.sum(axis=1))
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]

参考

[numpy指南]http://docs.scipy.org/doc/numpy/reference/

[numpy ndarray详解]https://danzhuibing.github.io/py_numpy_ndarray.html

[NumPy-快速处理数据]http://old.sebug.net/paper/books/scipydoc/numpy_intro.html