import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from revoscalepy.computecontext.RxInSqlServer import RxInSqlServer
from revoscalepy.computecontext.RxInSqlServer import RxSqlServerData
from revoscalepy.etl.RxImport import rx_import_datasource


def get_rental_predictions():
    """Fit a linear regression on ``dbo.rental_data`` and print its test error.

    Imports the ``dbo.rental_data`` table through revoscalepy into a pandas
    data frame, samples 80% of the rows as a training set (the remaining rows
    form the test set), fits ``LinearRegression`` to predict ``RentalCount``,
    and prints the data frame, both set shapes, the test-set predictions and
    the mean squared error.

    Returns:
        None. All results are reported via ``print``.
    """
    conn_str = 'Driver=SQL Server;Server=MYSQLSERVER;Database=TutorialDB;Trusted_Connection=True;'

    # Column types passed to the data source as ``colInfo``.
    column_info = {
        "Year": {"type": "integer"},
        "Month": {"type": "integer"},
        "Day": {"type": "integer"},
        "RentalCount": {"type": "integer"},
        "WeekDay": {
            "type": "factor",
            "levels": ["1", "2", "3", "4", "5", "6", "7"],
        },
        "Holiday": {"type": "factor", "levels": ["1", "0"]},
        "Snow": {"type": "factor", "levels": ["1", "0"]},
    }

    data_source = RxSqlServerData(
        table="dbo.rental_data",
        connectionString=conn_str,
        colInfo=column_info,
    )

    # NOTE(review): the compute context is constructed but never passed to
    # anything in this function. The script used to build it twice (once into
    # an unused local); a single construction is kept in case the constructor
    # has effects not visible here -- confirm whether it can be dropped.
    RxInSqlServer(connectionString=conn_str, numTasks=1, autoCleanup=False)

    # Import the data source and convert it to a pandas data frame.
    df = pd.DataFrame(rx_import_datasource(data_source))
    print("Data frame:", df)

    # The variable we are predicting.
    target = "RentalCount"

    # Feature columns: everything except "Year" and the target itself.
    # Leaving the target among the features would let the model read the
    # answer directly and make the reported error meaningless.
    excluded = {"Year", target}
    columns = [c for c in df.columns.tolist() if c not in excluded]

    # Generate the training set. random_state is fixed so results can be
    # replicated.
    train = df.sample(frac=0.8, random_state=1)
    # Everything not in the training set goes into the testing set.
    test = df.loc[~df.index.isin(train.index)]
    print("Training set shape:", train.shape)
    print("Testing set shape:", test.shape)

    # Fit the model to the training data.
    lin_model = LinearRegression()
    lin_model.fit(train[columns], train[target])

    # Generate predictions for the test set.
    lin_predictions = lin_model.predict(test[columns])
    print("Predictions:", lin_predictions)

    # Compute the error between the actual values and the test predictions
    # (scikit-learn's argument order is y_true, y_pred).
    lin_mse = mean_squared_error(test[target], lin_predictions)
    print("Computed error:", lin_mse)


# Run the pipeline only when executed as a script, so importing this module
# does not trigger a database read and model fit.
if __name__ == "__main__":
    get_rental_predictions()