mirror of
https://github.com/Microsoft/sql-server-samples.git
synced 2025-12-08 14:58:54 +00:00
60 lines
1.7 KiB
Python
60 lines
1.7 KiB
Python
import sys
|
|
import numpy as np
|
|
from sklearn.pipeline import Pipeline
|
|
from datasource import DataSource
|
|
from pipeline import *
|
|
from revoscalepy.etl.RxImport import rx_import_datasource
|
|
from sklearn.metrics import classification_report
|
|
|
|
|
|
|
|
def run(connectionstring=None):
    """Train and evaluate the station classifier end to end.

    Loads the data through ``DataSource``, pushes it through the feature
    engineering pipeline, holds out the last ``test_size`` rows of every
    station as the test set, fits an ``RxClassifier`` on the remaining rows,
    then prints the tail of the frame returned by ``fit`` and a
    classification report for the hold-out predictions.

    Args:
        connectionstring: Optional connection string handed to
            ``DataSource``. When omitted, the sample default below is used.
    """
    if connectionstring is None:
        # modify connection string to point to MLS/SQL Server instance where you restored the database
        connectionstring = 'Driver=SQL Server;Server=MLMACHINE\\SQLSERVER17;Database=velibdb;Trusted_Connection=True;'

    ds = DataSource(connectionstring)
    df = ds.loaddata()

    pipeline = Pipeline(steps=[
        ('outliers', OutliersHandler()),
        ('label', LabelDefiner()),
        ('dt', DateTimeFeaturesExtractor()),
        ('ts', TSFeaturesExtractor()),
        ('st', StatisticalFeaturesExtractor()),
        ('exclusion', FeaturesExcluder()),
        ('scaler', FeaturesScaler()),
    ])

    # Execute Pipeline
    df = pipeline.fit_transform(df)

    # split dataset
    test_size = 24 * 4  # one day test set of each station

    # Number every row by its distance from the end of its own station group
    # (0 = last row).  The previous ``head(df.shape[0] - test_size)`` used the
    # row count of the WHOLE frame as the per-group head size, which returns
    # every row of every station as soon as there is more than one station,
    # leaking the test rows into the training set.
    rows_from_end = df.groupby('stationid').cumcount(ascending=False)
    # Plain boolean array so the mask is applied positionally, independent of
    # whatever index the pipeline left on the frame.
    is_test = (rows_from_end < test_size).values
    train = df[~is_test]
    test = df[is_test]

    # fit classifier
    clf = RxClassifier(computecontext=ds.getcomputecontext())
    coeffs = clf.fit(train)
    # print coefficients and exclude stationid Factor
    # NOTE(review): 14 is presumably the number of non-stationid terms - confirm
    print(coeffs.tail(14))

    # run prediction on hold out set and evaluate
    y_pred = clf.predict(test.drop(['label'], axis=1))
    # ``Series.as_matrix()`` was removed in pandas 1.0; ``.values`` yields the
    # same ndarray on both old and new pandas versions.
    y_truth = test['label'].values
    print(classification_report(y_truth, y_pred))
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the full train/evaluate workflow only when this
    # file is executed directly, not when it is imported.
    run()
|
|
|
|
|