mirror of
https://github.com/Microsoft/sql-server-samples.git
synced 2025-12-08 14:58:54 +00:00
70 lines
3.1 KiB
R
70 lines
3.1 KiB
R
####################################################################################################
|
|
## Title: Telco Customer Churn
|
|
## Description: Data Preparation
|
|
## Author: Microsoft
|
|
## Note: Prepare the training and testing data sets by pre-processing and splitting the raw data
|
|
####################################################################################################
|
|
|
|
## Build the SQL Server data sources for the churn data and run the
## pre-processing step that fills the training and testing tables.
##
## Args:
##   sqlSettings: object whose `connString` element is the SQL Server
##                connection string (no other element is read here).
##   trainTable:  name of the SQL table that receives the training data.
##   testTable:   name of the SQL table that receives the testing data.
##
## Returns:
##   The value returned by rxExec() for the preProcess call.
##
## NOTE(review): `cdrSQL`, `inputTable` and `cdrColInfo` are neither
## parameters nor locals; they are looked up in the enclosing environment.
## Confirm they are defined before this function is called.
dataPreparation <- function(sqlSettings, trainTable, testTable) {
  sqlConnString <- sqlSettings$connString

  ## Query necessary columns from the call detail record table
  dataVars <- rxGetVarNames(cdrSQL)
  dataVars <- dataVars[!dataVars %in% c("year", "month")]
  if (length(dataVars) == 0L) {
    ## Fail here with a clear message instead of sending "select  from ..."
    ## to the server.
    stop("No columns left to select from the call detail record table",
         call. = FALSE)
  }

  ## Bracket-quote each column (doubling any "]") so names that contain
  ## spaces, punctuation or reserved words still form a valid select list.
  ## `inputTable` is left as given because it may be schema-qualified.
  quotedVars <- paste0("[", gsub("]", "]]", dataVars, fixed = TRUE), "]")
  dataQuery <- paste("select", paste(quotedVars, collapse = ", "),
                     "from", inputTable)

  ## Create sql server data sources
  inputDataSQL <- RxSqlServerData(sqlQuery = dataQuery,
                                  connectionString = sqlConnString,
                                  colInfo = cdrColInfo)
  trainDataSQL <- RxSqlServerData(connectionString = sqlConnString,
                                  table = trainTable,
                                  colInfo = cdrColInfo)
  testDataSQL <- RxSqlServerData(connectionString = sqlConnString,
                                 table = testTable,
                                 colInfo = cdrColInfo)

  ## Data pre-processing: cleaning and splitting followed by SMOTE
  rxExec(preProcess,
         inData = inputDataSQL,
         outData1 = trainDataSQL,
         outData2 = testDataSQL)
}
|
|
|
|
## Clean the raw data, split it into training and testing sets, rebalance
## the training set with SMOTE, and write both sets to their destinations.
##
## Args:
##   inData:   data source to read the raw records from.
##   outData1: destination for the SMOTE-balanced training data.
##   outData2: destination for the testing data.
##
## Returns:
##   The value of the final rxDataStep() call (the testing-data write).
##
## Side effects: rxSplit() is given the output base name "trainTestData",
## and both destinations are written with overwrite = TRUE.
preProcess <- function(inData, outData1, outData2) {
  ## Clean missing data
  cdrDF <- rxDataStep(inData = inData,
                      removeMissings = TRUE,
                      overwrite = TRUE)

  ## Remove duplicate rows; drop = FALSE keeps a data frame even if only a
  ## single column is present.
  cdrDF <- cdrDF[!duplicated(cdrDF), , drop = FALSE]

  ## Split data: each row is randomly labelled Test (prob 0.3) or
  ## Train (prob 0.7). The seed is fixed so the assignment is repeatable.
  set.seed(1234)
  splitFile <- rxSplit(inData = cdrDF,
                       outFilesBase = "trainTestData",
                       splitByFactor = "ind",
                       transforms = list(
                         ind = factor(sample(0:1,
                                             size = .rxNumRows,
                                             replace = TRUE,
                                             prob = c(0.3, 0.7)),
                                      levels = 0:1,
                                      labels = c("Test", "Train"))),
                       overwrite = TRUE)

  ## NOTE(review): positions assume rxSplit returns its outputs in factor
  ## level order (Test first, Train second) -- confirm.
  trainFile <- splitFile[[2]]
  testFile <- splitFile[[1]]

  ## Read both splits back, dropping the helper split indicator column
  trainDF <- rxDataStep(inData = trainFile, varsToDrop = c("ind"))
  testDF <- rxDataStep(inData = testFile, varsToDrop = c("ind"))

  ## SMOTE on training data
  library(unbalanced)
  trainVars <- names(trainDF)
  trainVarsInd <- trainVars %in% c("churn")
  smotetrain <- ubSMOTE(X = trainDF[!trainVarsInd], Y = trainDF$churn,
                        perc.over = 200, perc.under = 500,
                        k = 3, verbose = TRUE)

  ## Re-attach the label as "churn". Naming it in the call avoids depending
  ## on the column name cbind() would derive from the argument expression.
  trainDF <- cbind(smotetrain$X, churn = smotetrain$Y)

  ## Load final training data and testing data into SQL
  rxDataStep(inData = trainDF, outFile = outData1, overwrite = TRUE)
  rxDataStep(inData = testDF, outFile = outData2, overwrite = TRUE)
}
|