-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathdata_analysis_adoption.Rmd
147 lines (111 loc) · 3.81 KB
/
data_analysis_adoption.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
---
title: "Analysis of libraries that declare old dependencies"
output: html_notebook
---
### Import required libraries
```{r}
# Packages used throughout the notebook.
library(tidyverse)
library(latex2exp)
library(scales)

# Use the black-and-white ggplot2 theme for every plot in this notebook.
theme_set(theme_bw())

# NOTE: the former `dim(data_versions)` call was removed from this chunk.
# Nothing above this point creates `data_versions` (the notebook only load()s
# an .RData file in the next chunk), so the call failed with "object not
# found" on a clean session or knit. Inspect the object after the data has
# been loaded.
```
### Read the data
```{r}
# CSV inputs, parsed with readr (each call returns a tibble).
data <- readr::read_csv(file = "Data/data.csv")
dataUp <- readr::read_csv(file = "Data/areUpToDate_all.csv")

# Restore the objects saved in the .RData file into the workspace.
# NOTE(review): presumably this is what defines `data_versions`, which the
# following chunks use -- confirm against the contents of the file.
load(file = "Data/data_versions.RData")
```
### Split `data_versions` into four groups by release history
```{r}
# Partition `data_versions` by release history. After each of the first two
# groups is identified it is removed from `filtered` (anti-join on `Library`),
# so the later groups are only drawn from the libraries that remain.

# Libraries with only one version released.
libraries_single_version <- data_versions %>%
  filter(NumVersions == 1)
nrow(libraries_single_version)
filtered <- dplyr::anti_join(
  data_versions,
  libraries_single_version,
  by = "Library"
)

# Libraries with more than one version but released the same day
# (MeanDiff == 0 or SDDiff == 0). Drawn from `filtered`, not `data_versions`,
# so single-version libraries cannot be counted a second time in this group.
libraries_single_day <- filtered %>%
  filter(MeanDiff == 0 | SDDiff == 0)
nrow(libraries_single_day)
filtered <- dplyr::anti_join(filtered, libraries_single_day, by = "Library")

# Libraries that have between 10 and 200 versions released (inclusive).
libraries_betw10and200_versions <- filtered %>%
  filter(NumVersions >= 10 & NumVersions <= 200)
nrow(libraries_betw10and200_versions)

# Libraries that have more than 200 versions released. This group gets its own
# name: assigning it to `libraries_betw10and200_versions` (as the code used to)
# silently replaced the 10-200 group that the plots further down are built on.
libraries_more200_versions <- filtered %>%
  filter(NumVersions > 200)
nrow(libraries_more200_versions)
```
```{r}
# Distribution of the hasNewer flag over all rows of `dataUp`.
table(dataUp$hasNewer)

# Collapse `dataUp` to one row per `latest` value. For every group keep the
# raw providers and flags as list columns and count the flags directly:
#   NumTrue  - rows whose hasNewer is "true"
#   NumFalse - every other row of the group
# The flag is normalised with tolower(as.character()) so the count is the same
# whether readr parsed the column as text ("true") or as logical (TRUE); a
# plain `== "true"` never matches a logical TRUE. `%in%` never yields NA, so a
# missing flag is counted as "not true" instead of aborting the chunk.
data_latest <- dataUp %>%
  group_by(latest) %>%
  summarise(
    Providers = list(provider),
    hasNewers = list(hasNewer),
    NumTrue = sum(tolower(as.character(hasNewer)) %in% "true"),
    NumFalse = n() - NumTrue
  )
nrow(data_latest)

# Add a column telling whether the library has at least one outdated
# dependency (at least one "true" flag) or not.
data_latest <- data_latest %>%
  mutate(hasOutdatedDeps = ifelse(NumTrue > 0, "yes", "no"))

# Share of "true" / "not true" flags over all rows.
total_flags <- sum(data_latest$NumTrue) + sum(data_latest$NumFalse)
ratioTrue <- sum(data_latest$NumTrue) / total_flags
ratioFalse <- sum(data_latest$NumFalse) / total_flags

# Export: the list columns are flattened to their character representation
# because a CSV cell cannot hold a vector.
grouped_data <- data_latest
grouped_data$Providers <- as.character(grouped_data$Providers)
grouped_data$hasNewers <- as.character(grouped_data$hasNewers)
write_csv(grouped_data, "data_latest.csv")

# Libraries with / without outdated dependencies. The column is
# `hasOutdatedDeps`; tibbles do not partial-match `$`, so the shorter
# `hasOutdatedDep` used before selected nothing.
table(data_latest$hasOutdatedDeps)

# Libraries with no dependencies: every provider recorded for the group is
# "none". `Providers` is a list column, so each element is tested explicitly
# rather than comparing the whole list against a string.
data_latest %>%
  filter(map_lgl(Providers, function(p) all(p %in% "none"))) %>%
  nrow()
```
### Read the additional data sets
```{r}
# Additional CSV inputs. `timeToRelease.csv` is read with base read.csv() and
# therefore arrives as a plain data.frame; all other files go through readr
# and arrive as tibbles.
dataa <- readr::read_csv(file = "Data/data.csv")
time_to_release <- read.csv(file = "Data/timeToRelease.csv")
versions <- readr::read_csv(file = "Data/upgrades-new.csv")
metrics_all <- readr::read_csv(file = "Data/artifacts_metrics_all.csv")
latest <- readr::read_csv(file = "Data/latest_versions.csv")
uptd <- readr::read_csv(file = "Data/areUpToDate_all.csv")
```
```{r}
library("ggpubr")

# Scatter plot of MeanDiff (x) against MeanPageranks (y) for
# `libraries_betw10and200_versions`. The y axis is log10-scaled with breaks at
# powers of ten, labelled as 10^k; points are semi-transparent (alpha = 0.2).
ggplot(
  libraries_betw10and200_versions,
  aes(x = MeanDiff, y = MeanPageranks)
) +
  geom_point(alpha = 0.2) +
  scale_y_log10(
    breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  )

# Same plot with MeanUsages on the y axis.
ggplot(
  libraries_betw10and200_versions,
  aes(x = MeanDiff, y = MeanUsages)
) +
  geom_point(alpha = 0.2) +
  scale_y_log10(
    breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  )
```
```{r}
# Relationship between the number of versions (NumVersions, x) and
# MeanPageranks (y) for `libraries_betw10and200_versions`. The y axis is
# log10-scaled with 10^k tick labels; points are drawn with alpha = 0.2.
ggplot(
  libraries_betw10and200_versions,
  aes(x = NumVersions, y = MeanPageranks)
) +
  geom_point(alpha = 0.2) +
  scale_y_log10(
    breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  )
```