AdrianSQL
diff --git a/‎samples/features/r-services/Telco Customer Churn v3/Data/edw_cdr.csv‎
Lines changed: 20469 additions & 0 deletions b/‎samples/features/r-services/Telco Customer Churn v3/Data/edw_cdr.csv‎
Lines changed: 20469 additions & 0 deletions
diff --git a/‎samples/features/r-services/Telco Customer Churn v3/Data/state_latlon.csv‎
Lines changed: 56 additions & 0 deletions b/‎samples/features/r-services/Telco Customer Churn v3/Data/state_latlon.csv‎
Lines changed: 56 additions & 0 deletions
diff --git a/‎samples/features/r-services/Telco Customer Churn v3/R/README.md‎
Lines changed: 20 additions & 0 deletions b/‎samples/features/r-services/Telco Customer Churn v3/R/README.md‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎samples/features/r-services/Telco Customer Churn v3/R/telcoChurn-dataExploration.R‎
Lines changed: 157 additions & 0 deletions b/‎samples/features/r-services/Telco Customer Churn v3/R/telcoChurn-dataExploration.R‎
Lines changed: 157 additions & 0 deletions
diff --git a/‎samples/features/r-services/Telco Customer Churn v3/R/telcoChurn-dataPreparation.R‎
Lines changed: 69 additions & 0 deletions b/‎samples/features/r-services/Telco Customer Churn v3/R/telcoChurn-dataPreparation.R‎
Lines changed: 69 additions & 0 deletions
diff --git a/‎samples/features/r-services/Telco Customer Churn v3/R/telcoChurn-evaluate.R‎
Lines changed: 40 additions & 0 deletions b/‎samples/features/r-services/Telco Customer Churn v3/R/telcoChurn-evaluate.R‎
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,56 @@
+"state","latitude","longitude"
+AK,61.3850,-152.2683
+AL,32.7990,-86.8073
+AR,34.9513,-92.3809
+AS,14.2417,-170.7197
+AZ,33.7712,-111.3877
+CA,36.1700,-119.7462
+CO,39.0646,-105.3272
+CT,41.5834,-72.7622
+DC,38.8964,-77.0262
+DE,39.3498,-75.5148
+FL,27.8333,-81.7170
+GA,32.9866,-83.6487
+HI,21.1098,-157.5311
+IA,42.0046,-93.2140
+ID,44.2394,-114.5103
+IL,40.3363,-89.0022
+IN,39.8647,-86.2604
+KS,38.5111,-96.8005
+KY,37.6690,-84.6514
+LA,31.1801,-91.8749
+MA,42.2373,-71.5314
+MD,39.0724,-76.7902
+ME,44.6074,-69.3977
+MI,43.3504,-84.5603
+MN,45.7326,-93.9196
+MO,38.4623,-92.3020
+MP,14.8058,145.5505
+MS,32.7673,-89.6812
+MT,46.9048,-110.3261
+NC,35.6411,-79.8431
+ND,47.5362,-99.7930
+NE,41.1289,-98.2883
+NH,43.4108,-71.5653
+NJ,40.3140,-74.5089
+NM,34.8375,-106.2371
+NV,38.4199,-117.1219
+NY,42.1497,-74.9384
+OH,40.3736,-82.7755
+OK,35.5376,-96.9247
+OR,44.5672,-122.1269
+PA,40.5773,-77.2640
+PR,18.2766,-66.3350
+RI,41.6772,-71.5101
+SC,33.8191,-80.9066
+SD,44.2853,-99.4632
+TN,35.7449,-86.7489
+TX,31.1060,-97.6475
+UT,40.1135,-111.8535
+VA,37.7680,-78.2057
+VI,18.0001,-64.8199
+VT,44.0407,-72.7093
+WA,47.3917,-121.5708
+WI,44.2563,-89.6385
+WV,38.4680,-80.9696
+WY,42.7475,-107.2085
@@ -0,0 +1,20 @@
+**Instructions**
+
+
+- Run the telcoChurn-main.R to drive the R demo
+- Run the telcoChurn-modelComparison.R to compare different algorithms that we tried to build churn models
+
+
+
+----------
+**Description**
+
+- **telcoChurn-setUp.R** - Setting up relevant R packages
+- **telcoChurn-evaluate.R** - Defining pre-functions for model evaluation 
+- **telcoChurn-dataExploration.R** - Creating a Shiny application to explore and visualize the data
+- **telcoChurn-dataPreparation.R** - Defining functions to do data pre-processing and splitting in order to generate suitable training and testing data sets
+- **telcoChurn-trainModel.R** - Defining a function to train the telco churn model with mxFastTree algorithm
+- **telcoChurn-main.R** - Main R file driving the demo execution
+- **telcoChurn-modelComparison.R** - R file to build and compare different tree-based classification models, including CRAN R algorithms - randomForest and xgboost, RevoScaleR algorithms – rxDForest and rxBTrees, as well as MicrosoftRML algorithms – mxFastTree and mxFastForest
+
+----------
@@ -0,0 +1,157 @@
+####################################################################################################
+## Title: Telco Customer Churn
+## Description: Data Exploration and Visualization
+## Author: Microsoft
+####################################################################################################
+
+library(shiny)
+library(leaflet)
+library(jsonlite)
+library(dplyr)
+library(ggplot2)
+
+## Load data from SQL
+# cdrDF <- rxImport(inData = cdrSQL)
+
+## Load data from local CSV files
+## NOTE(review): `wd` is not defined in this script; presumably the driver
+## script sets it to the sample's root folder before this file runs - confirm.
+cdrFile <- file.path(wd, "Data", "edw_cdr.csv")
+latlonFile <- file.path(wd, "Data", "state_latlon.csv")
+
+cdrDF <- read.csv(file = cdrFile, header = TRUE, sep = ",")
+latlonDF <- read.csv(file = latlonFile, header = TRUE, sep = ",")
+
+## Per-state complaint and churn totals with an HTML popup label, joined back
+## onto every call record of that state and onto the state's coordinates.
+## The result therefore has one row per call record, not one row per state.
+data <- cdrDF %>%
+  group_by(state) %>%
+  summarise(
+    complaintsbystate = sum(as.numeric(numberofcomplaints)),
+    churnbystate = sum(as.numeric(churn))
+  ) %>%
+  mutate(
+    lab = paste0(
+      "<center>", "state,", state, ": ", "<br>",
+      "complaintsbystate,", complaintsbystate, "<br>",
+      "churnbystate,", churnbystate, "</center>"
+    )
+  ) %>%
+  left_join(cdrDF, by = "state") %>%
+  left_join(latlonDF, by = "state")
+
+## Page layout: a sidebar with the map controls and three summary plots,
+## and a main panel holding the leaflet map.
+ui <- fluidPage(
+  tags$style(HTML("
+                  @import url('https://fonts.googleapis.com/css?family=Poppins');
+                  
+                  body {
+                  
+                  font-family: 'Poppins', 'Lucida Grande', Verdana, Lucida, Helvetica, Arial, Calibri, sans-serif;
+                  color: rgb(0,0,0);
+                  background-color: #d2d2d2;
+                  }
+                  ")),
+
+  titlePanel("Telco Customer Churn"),
+
+  # Sidebar with the circle-size slider, the state selector and the plots
+  sidebarLayout(
+    sidebarPanel(
+      sliderInput("sc", "Scale size of circles (also redraws map to show only the last added state)",
+                  min = 0.5, max = 5, value = 1, step = 0.1),
+      p(),
+      # `data` holds one row per call record, so the state column repeats;
+      # offer each state once. The leading "" entry means "no state selected".
+      selectInput("state", "Select a state to add to the map",
+                  choices = c("", unique(as.character(data$state))),
+                  selected = "",
+                  selectize = FALSE),
+      actionButton("clear1", "Clear all states"),
+      p(),
+      p("Proportion of customer churn"),
+      plotOutput("MyPlot1", height = "200px"),
+      p(),
+      p("Impact of education on churn"),
+      plotOutput("MyPlot2", height = "200px"),
+      p(),
+      p("Impact of call failure rate on churn"),
+      plotOutput("MyPlot3", height = "200px"),
+      h2("About"),
+      HTML("<p>Created with R and Shiny leaflet. R users can download the 
+           cleaned and tidy call detail record data from <a href = 'https://github.com/Microsoft/sql-server-samples/tree/master/samples/features/r-services/Telco%20Customer%20Churn'>
+           https://github.com/Microsoft/sql-server-samples/tree/master/samples/features/r-services/Telco%20Customer%20Churn</a>.  
+           The latitude and longitude for each USA state can be found from <a href = 'http://dev.maxmind.com/geoip/legacy/codes/state_latlon/'>
+           http://dev.maxmind.com/geoip/legacy/codes/state_latlon/</a>.</p>")
+    ),
+
+    mainPanel(
+      leafletOutput("MyMap", height = 1000)
+    )
+  )
+)
+
+## Shiny server logic: keeps the leaflet map in sync with the state selector,
+## the circle-size slider and the clear button, and renders the three plots.
+server <- function(input, output, session) {
+
+    ## Row and marker colour for the currently selected state.
+    ## Returns list(df = <one-row data frame>, thecol = <colour or NULL>).
+    the_data_state <- reactive({
+        if (input$state != "") {
+            # `data` repeats the per-state totals on every call record of the
+            # state; one row is enough to draw one marker.
+            tmp <- data %>%
+                filter(state == input$state) %>%
+                head(1)
+
+            # `data` as built in this script has no "colour" column, so looking
+            # it up unconditionally fails. Use it when present, otherwise fall
+            # back to the default colour of addCircleMarkers().
+            if ("colour" %in% names(tmp)) {
+                thecol <- as.character(tmp[["colour"]])
+            } else {
+                thecol <- "#03F"
+            }
+        } else {
+            tmp <- data[1,]
+            thecol <- NULL
+        }
+
+        list(df = tmp, thecol = thecol)
+    })
+
+    ## Base map; markers are added later through leafletProxy()
+    output$MyMap <- renderLeaflet({
+        leaflet() %>%
+            addProviderTiles("Stamen.Watercolor") %>%
+            addProviderTiles("Stamen.TonerLabels") %>%
+            fitBounds(-120, 30, -60, 50)
+    })
+
+    ## Add a circle for the selected state, sized by its churn total
+    observe({
+        # No state selected (start-up, or after "Clear all states"):
+        # draw nothing instead of a marker for the placeholder row.
+        req(input$state)
+
+        selected <- the_data_state()
+        leafletProxy("MyMap", data = selected$df) %>%
+            addCircleMarkers(~longitude,
+                             ~latitude,
+                             color = selected$thecol,
+                             radius = ~churnbystate * 0.1 * input$sc,
+                             popup = ~lab)
+    })
+
+    ## "Clear all states": reset the selector and remove every marker
+    observe({
+        x <- input$clear1
+        updateSelectInput(session, "state", selected = "")
+        leafletProxy("MyMap") %>% clearMarkers()
+    })
+
+    ## Changing the scale wipes the map; the observer above then redraws the
+    ## current state. The higher priority makes the wipe run before the redraw.
+    observe({
+        x <- input$sc
+        leafletProxy("MyMap") %>% clearMarkers()
+    }, priority = 1)
+
+    ## Share of churned vs. retained records, as a pie chart
+    output$MyPlot1 <- renderPlot({
+        cdrDF %>%
+            ggplot(aes(x = factor(1), fill = factor(churn))) +
+            geom_bar(width = 1) +
+            coord_polar(theta = "y") +
+            theme_minimal()
+    })
+
+    ## Churn counts per month, split by education
+    output$MyPlot2 <- renderPlot({
+        cdrDF %>%
+            group_by(month, education) %>%
+            summarize(countofchurn = sum(as.numeric(churn))) %>%
+            ggplot(aes(x = month, y = countofchurn,
+                       group = education, fill = education)) +
+            geom_bar(stat = "identity", position = position_dodge()) +
+            labs(x = "month", y = "Counts of churn") +
+            theme_minimal()
+    })
+
+    ## Churn counts per month, split by call failure rate
+    output$MyPlot3 <- renderPlot({
+        data %>%
+            group_by(month, callfailurerate) %>%
+            summarize(countofchurn = sum(as.numeric(churn))) %>%
+            ggplot(aes(x = month, y = countofchurn,
+                       group = factor(callfailurerate), fill = factor(callfailurerate))) +
+            geom_bar(stat = "identity", position = position_dodge()) +
+            labs(x = "month", y = "Counts of churn") +
+            theme_minimal()
+    })
+}
+
@@ -0,0 +1,69 @@
+####################################################################################################
+## Title: Telco Customer Churn
+## Description: Data Preparation 
+## Author: Microsoft
+## Note: Prepare the training and testing data sets by pre-processing and splitting the raw data
+####################################################################################################
+
+## Build the SQL Server data sources for the input, training and testing data
+## and run preProcess() on them through rxExec().
+##
+## sqlSettings: list whose `connString` element is the connection string
+## trainTable:  table name used for the training data source
+## testTable:   table name used for the testing data source
+##
+## NOTE(review): `cdrSQL`, `inputTable` and `cdrColInfo` are not arguments and
+## are not defined in this file; presumably the driver script defines them
+## before this function is called - confirm.
+dataPreparation <- function(sqlSettings, trainTable, testTable) {
+    connStr <- sqlSettings$connString
+
+    ## Select every column of the call detail record table except year/month
+    columns <- rxGetVarNames(cdrSQL)
+    columns <- columns[!(columns %in% c("year", "month"))]
+    dataQuery <- paste("select", paste(columns, collapse = ", "), "from", inputTable)
+
+    ## Create sql server data sources
+    inputDataSQL <- RxSqlServerData(sqlQuery = dataQuery,
+                                    connectionString = connStr,
+                                    colInfo = cdrColInfo)
+    trainDataSQL <- RxSqlServerData(connectionString = connStr,
+                                    table = trainTable,
+                                    colInfo = cdrColInfo)
+    testDataSQL <- RxSqlServerData(connectionString = connStr,
+                                   table = testTable,
+                                   colInfo = cdrColInfo)
+
+    ## Data pre-processing: cleaning and splitting followed by SMOTE
+    rxExec(preProcess,
+           inData = inputDataSQL,
+           outData1 = trainDataSQL,
+           outData2 = testDataSQL)
+}
+
+## Clean the input data, split it into test and train parts, apply SMOTE to
+## the training part and write both parts out.
+##
+## inData:   data source to read the raw records from
+## outData1: destination for the final training data
+## outData2: destination for the testing data
+preProcess <- function(inData, outData1, outData2) {
+    ## Drop rows with missing values, then drop duplicate rows
+    cleanDF <- rxDataStep(inData = inData,
+                          removeMissings = TRUE,
+                          overwrite = TRUE)
+    cleanDF <- cleanDF[!duplicated(cleanDF), ]
+
+    ## Split on a randomly drawn Test/Train factor (probabilities 0.3 / 0.7);
+    ## the seed is fixed before the draw
+    set.seed(1234)
+    splitFiles <- rxSplit(
+        inData = cleanDF,
+        outFilesBase = "trainTestData",
+        splitByFactor = "ind",
+        transforms = list(ind = factor(sample(0:1, size = .rxNumRows, replace = TRUE, prob = c(0.3, 0.7)),
+                                       levels = 0:1,
+                                       labels = c("Test", "Train"))),
+        overwrite = TRUE
+    )
+
+    ## Levels are ordered Test, Train: element 2 is the training split
+    trainDF <- rxDataStep(inData = splitFiles[[2]], varsToDrop = c("ind"))
+    testDF <- rxDataStep(inData = splitFiles[[1]], varsToDrop = c("ind"))
+
+    ## SMOTE on training data, with `churn` as the label
+    library(unbalanced)
+    isLabel <- names(trainDF) %in% c("churn")
+    smoteResult <- ubSMOTE(X = trainDF[!isLabel], Y = trainDF$churn,
+                           perc.over = 200, perc.under = 500,
+                           k = 3, verbose = TRUE)
+    trainDF <- cbind(smoteResult$X, churn = smoteResult$Y)
+
+    ## Load final training data and testing data into SQL
+    rxDataStep(inData = trainDF, outFile = outData1, overwrite = TRUE)
+    rxDataStep(inData = testDF, outFile = outData2, overwrite = TRUE)
+}
@@ -0,0 +1,40 @@
+################################################################
+## Title: Telco Customer Churn
+## Description: Defining pre-functions
+## Author: Microsoft
+################################################################
+
+####################################################################################################
+## Define functions for model evaluation
+####################################################################################################
+## Define evaluation metrics
+##
+## Prints the confusion table of observed vs. predicted labels and returns a
+## named numeric vector with "Accuracy", "Precision", "Recall" and "F-Score",
+## treating label 1 as the positive class and label 0 as the negative class.
+##
+## data:      data frame holding both label columns
+## observed:  name of the column with the actual labels
+## predicted: name of the column with the predicted labels
+##
+## A ratio whose denominator is zero (for example precision when nothing is
+## predicted positive) is returned as NaN.
+evaluateModel <- function(data, observed, predicted) 
+{
+  confusion <- table(data[[observed]], data[[predicted]])
+  print(confusion)
+
+  # Count over fixed 0/1 levels so that all four cells exist even when a class
+  # never occurs in one of the columns; otherwise the lookups below come back
+  # empty and the metrics vector collapses.
+  classLevels <- c("0", "1")
+  cells <- table(factor(as.character(data[[observed]]), levels = classLevels),
+                 factor(as.character(data[[predicted]]), levels = classLevels))
+  tp <- unname(cells["1", "1"])
+  fn <- unname(cells["1", "0"])
+  fp <- unname(cells["0", "1"])
+  tn <- unname(cells["0", "0"])
+
+  accuracy <- (tp + tn) / (tp + fn + fp + tn)
+  precision <- tp / (tp + fp)
+  recall <- tp / (tp + fn)
+  fscore <- 2 * (precision * recall) / (precision + recall)
+
+  c("Accuracy" = accuracy,
+    "Precision" = precision,
+    "Recall" = recall,
+    "F-Score" = fscore)
+}
+
+## Define ROC curve
+##
+## Keeps only the observed and predicted columns, converts the observed
+## labels to numeric via their character form, and passes the result to
+## rxRocCurve().
+##
+## data:      data frame holding both columns
+## observed:  name of the column with the actual labels
+## predicted: name of the column with the predicted values
+rxrocCurve <- function(data, observed, predicted) 
+{
+  rocInput <- data[, c(observed, predicted)]
+  # Go through as.character() so factor labels convert by their text
+  # rather than by their internal level codes
+  observedText <- as.character(rocInput[[observed]])
+  rocInput[[observed]] <- as.numeric(observedText)
+  rxRocCurve(
+    actualVarName = observed,
+    predVarNames = predicted,
+    data = rocInput
+  )
+}
+
+