1
0
mirror of https://github.com/Microsoft/sql-server-samples.git synced 2025-12-08 14:58:54 +00:00
Files
2017-05-30 10:03:01 +02:00

60 lines
1.7 KiB
Python

import sys
import numpy as np
from sklearn.pipeline import Pipeline
from datasource import DataSource
from pipeline import *
from revoscalepy.etl.RxImport import rx_import_datasource
from sklearn.metrics import classification_report
def run():
# modify connection string to point to MLS/SQL Server instance where you restored the database
connectionstring = 'Driver=SQL Server;Server=MLMACHINE\\SQLSERVER17;Database=velibdb;Trusted_Connection=True;'
ds = DataSource(connectionstring)
df = ds.loaddata()
pipeline = Pipeline(steps= [('outliers', OutliersHandler()),
('label',LabelDefiner()),
('dt', DateTimeFeaturesExtractor()),
('ts', TSFeaturesExtractor()),
('st', StatisticalFeaturesExtractor()),
('exclusion', FeaturesExcluder()),
('scaler', FeaturesScaler())]
)
# Execute Pipeline
df = pipeline.fit_transform(df)
# split dataset
test_size = 24 * 4 # one day test set of each station
train = df.groupby('stationid').head(df.shape[0] - test_size)
test = df.groupby('stationid').tail(test_size)
# fit classifier
clf = RxClassifier(computecontext = ds.getcomputecontext())
coeffs = clf.fit(train)
#print coefficients and exclude stationid Factor
print(coeffs.tail(14))
# run prediction on hold out set and evaluate
y_pred = clf.predict(test.drop(['label'], axis=1, inplace = False))
y_truth = test['label'].as_matrix()
print(classification_report(y_truth, y_pred))
if __name__ == "__main__":
run()