如果数据量非常大,建议在 Linux 下执行;在本地执行可能需要扩充虚拟内存。
代码:
#csv文件格式转化为libsvm文件格式
import pandas as pd
import time
def libsvm(df, fp):
    """Write the rows of ``df`` to ``fp`` in a libsvm-style sparse text format.

    Each row becomes one output line. The value of the first column,
    converted with ``str``, opens the line. Every later column whose value
    is neither a string nor equal to 0 is appended as ``" <index>:<value>"``,
    where ``<index>`` is the 0-based position of the column in ``df`` and
    ``<value>`` is rendered with three decimals.

    Column labels are not inspected, so frames with non-string labels
    (for example default integer headers) are accepted.

    Args:
        df: pandas DataFrame to convert; its first column opens each line.
        fp: Path of the text file to write; opened in ``'w'`` mode.

    Returns:
        None. The result is written to ``fp``; start time, elapsed time and
        a completion marker are printed.
    """
    now = time.time()
    print('Format Converting begin in time:..........',now)
    with open(fp, 'w') as f:
        for row in df.values:
            parts = [str(row[0])]
            # start=1 keeps the emitted index equal to the column position.
            for i, value in enumerate(row[1:], start=1):
                # Strings cannot be rendered with %.3f, and zeros are left
                # out to keep the output sparse.
                if not isinstance(value, str) and value != 0:
                    parts.append("%d:%.3f" % (i, value))
            f.write(" ".join(parts) + '\n')
    print('finish convert,the cost time is ',time.time()-now)
    print('[Done]')
    print()
def main(csv_path=r'/home/wanjintao/PM_sourcedata.csv',
         out_path=r'/home/wanjintao/PM2.5_libsvm.txt'):
    """Read a CSV file, fill missing values with 0 and convert it with ``libsvm``.

    Args:
        csv_path: Path of the CSV file to read with ``pd.read_csv``.
            Defaults to the path originally hard-coded in this script.
        out_path: Path of the text file passed to ``libsvm``.
            Defaults to the path originally hard-coded in this script.
    """
    df = pd.read_csv(csv_path)
    # Missing values become 0, which libsvm() then omits from the output.
    df = df.fillna(0)
    libsvm(df, out_path)


if __name__ == '__main__':
    main()
执行效果:
原数据:
生成的libsvm数据:
版权声明:本文为wjt199866原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。