90 lines
2.1 KiB
Python
90 lines
2.1 KiB
Python
import pandas as pd
|
||
import numpy as np
|
||
from sklearn.preprocessing import MinMaxScaler
|
||
from sklearn.model_selection import train_test_split
|
||
|
||
|
||
def data_format(data_path, is_column=False, rate=0.25):
    """Load the dataset, scale it and split it into train and test sets.

    The array returned by ``data_load_forecast`` is scaled to the range
    [-1, 1] with ``MinMaxScaler``. The last column of the scaled array is
    used as the target ``Y`` and all preceding columns as the features ``X``.

    Note:
        The scaler is fitted on the complete array before the train/test
        split, so the test rows contribute to the scaling range, and the
        target column is scaled together with the features.

    Args:
        data_path: Path of the CSV file to load.
        is_column (bool, optional): Forwarded to ``data_load_forecast``;
            selects column-wise instead of row-wise NaN removal.
            Defaults to False.
        rate (float, optional): Passed as ``test_size`` to
            ``train_test_split``. Defaults to 0.25.

    Returns:
        tuple: ``(X_train, X_test, Y_train, Y_test)``.
    """
    # Load the raw samples.
    samples = data_load_forecast(data_path, is_column)

    # Normalise every column to [-1, 1].
    scaled = MinMaxScaler(feature_range=(-1, 1)).fit_transform(samples)

    # The last column is the target, everything before it is a feature.
    features, target = scaled[:, :-1], scaled[:, -1]

    # Fixed random_state keeps the split reproducible between runs.
    parts = train_test_split(
        features, target, test_size=rate, random_state=7)
    return tuple(parts)
|
||
|
||
|
||
def data_load_forecast(data_path, is_column=False):
    """Load the CSV file and return the selected samples as an array.

    Steps:
        1. Read the CSV file at ``data_path``.
        2. Remove NaN data with ``data_clean`` (rows by default, columns
           when ``is_column`` is true).
        3. Keep only the rows whose ``'output'`` column equals 1.
        4. Drop the first column.
        5. Keep at most the first 128 of the remaining columns.

    Args:
        data_path: Path of the CSV file to load.
        is_column (bool, optional): If true, drop columns containing NaN
            instead of rows containing NaN. Defaults to False.

    Returns:
        numpy.ndarray: 2-D array with one row per selected CSV row and up
        to 128 columns.
    """
    # Read the CSV file.
    df = pd.read_csv(data_path)

    # data_clean returns a new DataFrame (dropna is not in-place), so the
    # result has to be assigned back for the cleaning to take effect.
    df = data_clean(df, is_column)

    # Keep only the rows labelled with output == 1.
    df = df[df['output'] == 1]

    # Drop the first column.
    df = df.drop(df.columns[0], axis=1)

    # Take the first 128 remaining columns of every row in one vectorised
    # slice instead of appending row by row.
    return df.iloc[:, :128].to_numpy()
|
||
|
||
|
||
def data_clean(data, is_column=False):
    """Return a copy of ``data`` with NaN-containing rows or columns removed.

    Args:
        data: DataFrame holding the CSV data. It is not modified.
        is_column (bool, optional): If true, drop every column that
            contains NaN; otherwise drop every row that contains NaN.
            Defaults to False.

    Returns:
        The cleaned DataFrame.
    """
    # axis=0 removes rows, axis=1 removes columns.
    drop_axis = 1 if is_column else 0
    return data.dropna(axis=drop_axis)
|
||
|
||
|
||
if __name__ == '__main__':
    # Manual check: load and split the bundled dataset, then print the
    # shape of the training features.
    dataset_path = 'data/archive/PowerQualityDistributionDataset1.csv'
    X_train, X_test, Y_train, Y_test = data_format(dataset_path)
    print(X_train.shape)
|