Skip to content

Commit 6021862

Browse files
committed
New version of telco customer churn demo
This new version of telco churn demo leverages R, RevoScaleR and MicrosoftRML packages in SQL Server 2016.
1 parent 618e150 commit 6021862

16 files changed

Lines changed: 22397 additions & 0 deletions

samples/features/r-services/Telco Customer Churn v3/Data/edw_cdr.csv

Lines changed: 20469 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
"state","latitude","longitude"
2+
AK,61.3850,-152.2683
3+
AL,32.7990,-86.8073
4+
AR,34.9513,-92.3809
5+
AS,14.2417,-170.7197
6+
AZ,33.7712,-111.3877
7+
CA,36.1700,-119.7462
8+
CO,39.0646,-105.3272
9+
CT,41.5834,-72.7622
10+
DC,38.8964,-77.0262
11+
DE,39.3498,-75.5148
12+
FL,27.8333,-81.7170
13+
GA,32.9866,-83.6487
14+
HI,21.1098,-157.5311
15+
IA,42.0046,-93.2140
16+
ID,44.2394,-114.5103
17+
IL,40.3363,-89.0022
18+
IN,39.8647,-86.2604
19+
KS,38.5111,-96.8005
20+
KY,37.6690,-84.6514
21+
LA,31.1801,-91.8749
22+
MA,42.2373,-71.5314
23+
MD,39.0724,-76.7902
24+
ME,44.6074,-69.3977
25+
MI,43.3504,-84.5603
26+
MN,45.7326,-93.9196
27+
MO,38.4623,-92.3020
28+
MP,14.8058,145.5505
29+
MS,32.7673,-89.6812
30+
MT,46.9048,-110.3261
31+
NC,35.6411,-79.8431
32+
ND,47.5362,-99.7930
33+
NE,41.1289,-98.2883
34+
NH,43.4108,-71.5653
35+
NJ,40.3140,-74.5089
36+
NM,34.8375,-106.2371
37+
NV,38.4199,-117.1219
38+
NY,42.1497,-74.9384
39+
OH,40.3736,-82.7755
40+
OK,35.5376,-96.9247
41+
OR,44.5672,-122.1269
42+
PA,40.5773,-77.2640
43+
PR,18.2766,-66.3350
44+
RI,41.6772,-71.5101
45+
SC,33.8191,-80.9066
46+
SD,44.2853,-99.4632
47+
TN,35.7449,-86.7489
48+
TX,31.1060,-97.6475
49+
UT,40.1135,-111.8535
50+
VA,37.7680,-78.2057
51+
VI,18.0001,-64.8199
52+
VT,44.0407,-72.7093
53+
WA,47.3917,-121.5708
54+
WI,44.2563,-89.6385
55+
WV,38.4680,-80.9696
56+
WY,42.7475,-107.2085
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
**Instructions**
2+
3+
4+
- Run the telcoChurn-main.R to drive the R demo
5+
- Run the telcoChurn-modelComparison.R to compare different algorithms that we tried to build churn models
6+
7+
8+
9+
----------
10+
**Description**
11+
12+
- **telcoChurn-setUp.R** - Setting up relevant R packages
13+
- **telcoChurn-evaluate.R** - Defining pre-functions for model evaluation
14+
- **telcoChurn-dataExploration.R** - Creating a Shiny application to explore and visualize the data
15+
- **telcoChurn-dataPreparation.R** - Defining functions to do data pre-processing and spliting in order to generate suitable training and testing data sets
16+
- **telcoChurn-trainModel.R** - Defining a function to train the telco churn model with mxFastTree algorithm
17+
- **telcoChurn-main.R** - Main R file driving the demo execution
18+
- **telcoChurn-modelComparison.R** - R file to build and compare different tree-based classification models, including CRAN R algorithms - randomForest and xgboost, RevoScaleR algorithms – rxDForest and rxBTrees, as well as MicrosoftRML algorithms – mxFasttree and mxFastforest
19+
20+
----------
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
####################################################################################################
2+
## Title: Telco Customer Churn
3+
## Description: Data Exploration and Visualization
4+
## Author: Microsoft
5+
####################################################################################################
6+
7+
library(shiny)
8+
library(leaflet)
9+
library(jsonlite)
10+
library(dplyr)
11+
library(ggplot2)
12+
13+
## Load data from SQL
14+
# cdrDF <- rxImport(inData = cdrSQL)
15+
16+
## Load data from local
17+
cdrFile <- file.path(wd, "Data", "edw_cdr.csv")
18+
cdrDF <- read.csv(file = cdrFile, header = TRUE, sep = ",")
19+
20+
latlonFile <- file.path(wd, "Data", "state_latlon.csv")
21+
latlonDF <- read.csv(file = latlonFile, header = TRUE, sep = ",")
22+
23+
data <- cdrDF %>%
24+
group_by(state) %>%
25+
summarise(complaintsbystate = sum(as.numeric(numberofcomplaints)),
26+
churnbystate = sum(as.numeric(churn))) %>%
27+
mutate(lab = paste0("<center>", "state,", state, ": ", "<br>",
28+
"complaintsbystate,", complaintsbystate, "<br>",
29+
"churnbystate,", churnbystate, "</center>")) %>%
30+
left_join(cdrDF, by = "state") %>%
31+
left_join(latlonDF, by = "state")
32+
33+
ui <- fluidPage(
34+
tags$style(HTML("
35+
@import url('https://fonts.googleapis.com/css?family=Poppins');
36+
37+
body {
38+
39+
font-family: 'Poppins', 'Lucida Grande', Verdana, Lucida, Helvetica, Arial, Calibri, sans-serif;
40+
color: rgb(0,0,0);
41+
background-color: #d2d2d2;
42+
}
43+
")),
44+
45+
titlePanel("Telco Customer Churn"),
46+
47+
# Sidebar with a slider input for number of bins
48+
sidebarLayout(
49+
sidebarPanel(
50+
sliderInput("sc", "Scale size of circles (also redraws map to show only the last added state)",
51+
min = 0.5, max = 5, value = 1, step = 0.1),
52+
p(),
53+
selectInput("state", "Select a state to add to the map",
54+
choices = c("", data$state), selected = "",
55+
size = , selectize = FALSE),
56+
actionButton("clear1", "Clear all states"),
57+
p(),
58+
p("Proportion of customer churn"),
59+
plotOutput("MyPlot1", height = "200px"),
60+
p(),
61+
p("Impact of education on churn"),
62+
plotOutput("MyPlot2", height = "200px"),
63+
p(),
64+
p("Impact of call failure rate on churn"),
65+
plotOutput("MyPlot3", height = "200px"),
66+
h2("About"),
67+
HTML("<p>Created with R and Shiny leaflet. R users can download the
68+
cleaned and tidy call detail record data from <a href = 'https://github.com/Microsoft/sql-server-samples/tree/master/samples/features/r-services/Telco%20Customer%20Churn'>
69+
https://github.com/Microsoft/sql-server-samples/tree/master/samples/features/r-services/Telco%20Customer%20Churn</a>.
70+
The latitute and longitute for each USA state can be found from <a href = 'http://dev.maxmind.com/geoip/legacy/codes/state_latlon/'>
71+
http://dev.maxmind.com/geoip/legacy/codes/state_latlon/</a>.")
72+
),
73+
74+
75+
mainPanel(
76+
leafletOutput("MyMap", height = 1000)
77+
78+
)
79+
)
80+
)
81+
82+
server <- function(input, output, session) {
83+
84+
the_data_state <- reactive({
85+
tmp <- data %>%
86+
filter(state == input$state)
87+
88+
if (input$state != "") {
89+
thecol <- data.frame(data)[data$state == input$state, "colour"]
90+
} else {
91+
tmp <- data[1,]
92+
thecol <- NULL
93+
94+
}
95+
96+
return(list(df = tmp, thecol = thecol))
97+
})
98+
99+
output$MyMap <- renderLeaflet({
100+
leaflet() %>%
101+
addProviderTiles("Stamen.Watercolor") %>%
102+
addProviderTiles("Stamen.TonerLabels") %>%
103+
fitBounds(-120, 30, -60, 50)
104+
})
105+
106+
observe({
107+
leafletProxy("MyMap", data = the_data_state()$df) %>%
108+
addCircleMarkers( ~ longitude,
109+
~ latitude,
110+
color = the_data_state()$thecol,
111+
radius = ~churnbystate * 0.1 * input$sc,
112+
popup = ~lab)
113+
})
114+
115+
observe({
116+
x <- input$clear1
117+
updateSelectInput(session, "state", selected = "")
118+
leafletProxy("MyMap") %>% clearMarkers()
119+
})
120+
121+
observe({
122+
x <- input$sc
123+
leafletProxy("MyMap") %>% clearMarkers()
124+
})
125+
126+
127+
output$MyPlot1 <- renderPlot({
128+
cdrDF %>%
129+
ggplot(aes(x = factor(1), fill = factor(churn))) +
130+
geom_bar(width = 1) +
131+
coord_polar(theta = "y") +
132+
theme_minimal()
133+
})
134+
135+
output$MyPlot2 <- renderPlot({
136+
cdrDF %>%
137+
group_by(month, education) %>%
138+
summarize(countofchurn = sum(as.numeric(churn))) %>%
139+
ggplot(aes(x = month, y = countofchurn,
140+
group = education, fill = education)) +
141+
geom_bar(stat = "identity", position = position_dodge()) +
142+
labs(x = "month", y = "Counts of churn") +
143+
theme_minimal()
144+
})
145+
146+
output$MyPlot3 <- renderPlot({
147+
data %>%
148+
group_by(month, callfailurerate) %>%
149+
summarize(countofchurn = sum(as.numeric(churn))) %>%
150+
ggplot(aes(x = month, y = countofchurn,
151+
group = factor(callfailurerate), fill = factor(callfailurerate))) +
152+
geom_bar(stat = "identity", position = position_dodge()) +
153+
labs(x = "month", y = "Counts of churn") +
154+
theme_minimal()
155+
})
156+
}
157+
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
####################################################################################################
2+
## Title: Telco Customer Churn
3+
## Description: Data Preparation
4+
## Author: Microsoft
5+
## Note: Prepare the training and testing data sets by pre-processing and spliting on raw data
6+
####################################################################################################
7+
8+
dataPreparation <- function(sqlSettings, trainTable, testTable) {
9+
sqlConnString <- sqlSettings$connString
10+
11+
## Query necessary columns from the call detail record table
12+
dataVars <- rxGetVarNames(cdrSQL)
13+
dataVars <- dataVars[!dataVars %in% c("year", "month")]
14+
dataVars <- paste(dataVars, collapse = ", ")
15+
dataQuery <- paste("select", dataVars, "from", inputTable)
16+
17+
## Create sql server data sources
18+
inputDataSQL = RxSqlServerData(sqlQuery = dataQuery,
19+
connectionString = sqlConnString,
20+
colInfo = cdrColInfo)
21+
trainDataSQL <- RxSqlServerData(connectionString = sqlConnString,
22+
table = trainTable,
23+
colInfo = cdrColInfo)
24+
testDataSQL <- RxSqlServerData(connectionString = sqlConnString,
25+
table = testTable,
26+
colInfo = cdrColInfo)
27+
28+
## Data pre-processing: cleaning and splitting followed by SMOTE
29+
rxExec(preProcess, inData = inputDataSQL, outData1 = trainDataSQL, outData2 = testDataSQL)
30+
}
31+
32+
preProcess <- function(inData, outData1, outData2) {
33+
## Clean missing data
34+
## Remove duplicate rows
35+
cdrDF <- rxDataStep(inData = inData,
36+
removeMissings = TRUE,
37+
overwrite = TRUE)
38+
cdrDF <- cdrDF[!duplicated(cdrDF),]
39+
40+
## Split data
41+
set.seed(1234)
42+
splitFile <- rxSplit(inData = cdrDF,
43+
outFilesBase = "trainTestData",
44+
splitByFactor = "ind",
45+
transforms = list(ind = factor(sample(0:1, size = .rxNumRows, replace = TRUE, prob = c(0.3, 0.7)),
46+
levels = 0:1,
47+
labels = c("Test", "Train"))),
48+
overwrite = TRUE)
49+
trainFile <- splitFile[[2]]
50+
testFile <- splitFile[[1]]
51+
52+
## SMOTE on training data
53+
trainDF <- rxDataStep(inData = trainFile, varsToDrop = c("ind"))
54+
testDF <- rxDataStep(inData = testFile, varsToDrop = c("ind"))
55+
56+
library(unbalanced)
57+
trainVars <- names(trainDF)
58+
trainVarsInd <- trainVars %in% c("churn")
59+
smotetrain <- ubSMOTE(X = trainDF[!trainVarsInd], Y = trainDF$churn,
60+
perc.over = 200, perc.under = 500,
61+
k = 3, verbose = TRUE)
62+
smotetrainDF <- cbind(smotetrain$X, smotetrain$Y)
63+
names(smotetrainDF)[names(smotetrainDF) == "smotetrain$Y"] <- "churn"
64+
trainDF <- smotetrainDF
65+
66+
## Load final training data and testing data into SQL
67+
rxDataStep(inData = trainDF, outFile = outData1, overwrite = TRUE)
68+
rxDataStep(inData = testDF, outFile = outData2, overwrite = TRUE)
69+
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
################################################################
2+
## Title: Telco Customer Churn
3+
## Description: Defining pre-functions
4+
## Author: Microsoft
5+
################################################################
6+
7+
####################################################################################################
8+
## Define functions for model evaluation
9+
####################################################################################################
10+
## Define evaluation metrics
11+
evaluateModel <- function(data, observed, predicted)
12+
{
13+
confusion <- table(data[[observed]], data[[predicted]])
14+
print(confusion)
15+
tp <- confusion[rownames(confusion) == 1, colnames(confusion) == 1]
16+
fn <- confusion[rownames(confusion) == 1, colnames(confusion) == 0]
17+
fp <- confusion[rownames(confusion) == 0, colnames(confusion) == 1]
18+
tn <- confusion[rownames(confusion) == 0, colnames(confusion) == 0]
19+
accuracy <- (tp + tn) / (tp + fn + fp + tn)
20+
precision <- tp / (tp + fp)
21+
recall <- tp / (tp + fn)
22+
fscore <- 2 * (precision * recall) / (precision + recall)
23+
metrics <- c("Accuracy" = accuracy,
24+
"Precision" = precision,
25+
"Recall" = recall,
26+
"F-Score" = fscore)
27+
return(metrics)
28+
}
29+
30+
## Define ROC curve
31+
rxrocCurve <- function(data, observed, predicted)
32+
{
33+
data <- data[, c(observed, predicted)]
34+
data[[observed]] <- as.numeric(as.character(data[[observed]]))
35+
rxRocCurve(actualVarName = observed,
36+
predVarNames = predicted,
37+
data = data)
38+
}
39+
40+

0 commit comments

Comments
 (0)