### import the dataset particulate.csv into R
www = "http://www.biostatisticaumg.it/dataset/particulate.csv"
particulate = read.csv(www, header = TRUE, sep = ";")
attach(particulate)
names(particulate)


### the Italian peninsula divided into two geographical areas, the Northern and Southern 
### part along the Tuscan-Emilian Apennines watershed
### [unpublished figure]
### position of italian Provinces 
### according to factor 'Where' (levels: north and south)
ormea = c(7.77, 44.1)
fraconalto = c(8.88, 44.58)
sanmarino =c(12.45, 43.94)
plot(Long, Lat, col = "white")
points(ormea[1], ormea[2], col = "red")
points(fraconalto[1], fraconalto[2], col = "red")
confineLon = c(ormea[1], fraconalto[1], sanmarino[1])
confineLat = c(ormea[2], fraconalto[2], sanmarino[2])
points(Long[Where == "north"], Lat[Where == "north"], bg = "violet", pch = 23)
points(Long[Where == "south"], Lat[Where == "south"], bg = "orange", pch = 21)
lines(confineLon, confineLat, col = "red", lwd = 3)
points(sanmarino[1], sanmarino[2], col = "red")


### To calculate the ratio of PM10 Exceedances over the StationsNum removing null denominator
ExcedRatioPM10 = Exceedances / StationsNum
ExcedRatioPM10[ExcedRatioPM10 == "NaN"] = 0
summary(ExcedRatioPM10)


### To define the primary outcome of the study, 
### as in Crawley, Chapter 14 'Proportion data' 
### https://onlinelibrary.wiley.com/doi/book/10.1002/9781119941750
response = cbind(Cases, Population - Cases)


### RESULTS 
### In the univariate analysis, the PM10 daily limit value exceedances appear to be a 
### significant predictor (p < .001) of infection (Fig. 3b) with a 1.29 cut-off value. 
### The cut-off divides the Provinces into two classes, respectively with higher (n = 43) 
### and lower (n = 67) PM10 concentrations. 
### [unpublished]
### the 1.29 cut-off value comes from exploratory analysis:
library(party)
preliminaryexploration = Cases ~ ExcedRatioPM10 + Density
plot(ctree(preliminaryexploration))
exceedPM10 = factor(ExcedRatioPM10 > 1.285714)
levels(exceedPM10)[2] = "higherPM10"
levels(exceedPM10)[1] = "lowerPM10"
preliminaryexploration2 = Cases ~ exceedPM10 + Density
plot(ctree(preliminaryexploration2))
### [unpublished]
### boxplot appears 'long tailed', log tansform may help in seeing


### figure previous3b
CasesProp = 1000 * Cases/Population
LogCasesProp = log(1+ CasesProp)
startrelation = LogCasesProp ~ exceedPM10 
Figure3B = ctree(startrelation)
plot(Figure3B)
### [unpublished figure] 
### same without log transform 
### plot(ctree(CasesProp ~ exceedPM10))


### the less polluted Provinces had a median 0.03 infection case over 1000 residents 
### (first - third quartile 0.01 - 0.09, range 0.00 - 0.56), while most polluted Provinces
### had a median 0.26 infection cases over 1000 Province residents 
### (first - third quartile 0.14 - 0.51, range 0.00 - 4.92).
summary(CasesProp[exceedPM10 == "lowerPM10"])
summary(CasesProp[exceedPM10 == "higherPM10"])


### Dividing the Italian peninsula into two areas, the Northern and Southern part along
### the Tuscan-Emilian Apennines watershed, the exceedPM10 results as follows: 
### 39 of the 41 Northern Provinces falls in the higher PM10 category, while on the 
### Southern Provinces the ratio is reversed: 62 over 66 have lower PM10 
### (odds ratio .00, Fisher exact test p < .001).
table(exceedPM10, Where)
fisher.test(table(exceedPM10, Where))



### Also the proportion of commuters over the Province population has a significant 
### (p = 0.01, not depicted) role in predicting the infection rates according to the 
### univariate binary partitioning tree analysis: after setting a cut-off of 47% ... 
withoutPM = LogCasesProp ~ Population + Density + Commuters + CommutersDensity
plot(ctree(withoutPM))  ## important: Density is not anymore a predictor


### ... people daily moving in Provinces, in the Provinces 
### with a lower number of commuters (n = 51) ... 
sum(CommutersDensity <= 47)


### ... the median infection case over 1000 Province residents is 0.03 
### (first – third quartile 0.01 – 0.05, range 0.00 – 0.33), while in the other Provinces
### the median infection cases over 1000 residents is 0.18 
### (first – third quartile 0.13 – 0.36, range 0.00 – 4.92).
summary(CasesProp[CommutersDensity <= 47])
summary(CasesProp[CommutersDensity > 47])




### Notably, when performing a bivariate conditional regression exploratory analysis 
### joining both the pollution and the proportion of commuters as possible predictors 
### of the infection rates, one obtains exactly the same tree depicted in Fig. 3b: 
### the commuters proportion looses its effect, suggesting a strong correlation of 
### air quality to the COVID-19 cases percentages breakout.
bivariate = LogCasesProp ~ exceedPM10 + CommutersDensity
plot(ctree(bivariate))  
### same as: 
### plot(Figure3B)


### The logistic regression depicted in Fig. 3A (semi-logarithmic scales)
### confirms the exploratory analysis:
plot(ExcedRatioPM10, Cases/Population, log = "y", col = "white",  pch = "." ,
		xlab = "Average daily PM10 exceedances", ylab = "Cases / Population")
minimaladequate = glm(response ~ ExcedRatioPM10, family = quasibinomial)
xx = seq(from = 0.1, to = 12, by = 0.01)
ip = minimaladequate$coef[[1]] + minimaladequate$coef[[2]] * xx 
yy = exp(ip)/(1+exp(ip))
lines(xx, yy, lty = 2, col = "darkgray", lwd = 3)
points(ExcedRatioPM10[Where == "south"], Cases[Where == "south"]/Population[Where == "south"],
		col = "black", bg = "black",  pch = 21)
points(ExcedRatioPM10[Where == "north"], Cases[Where == "north"]/Population[Where == "north"],
		col = "black", bg = "lightgray",  pch = 22)


### ... a binomial distributed generalized linear model, corrected for overdispersion, 
### reveals an increasing slope of 0.25 (s.e. 0.04, p < 0.001) of the linear predictor. 
summary(minimaladequate)$coef


### after Reviewers revision, May/June 2020
### Correlation between Province PM2.5 and PM10 exceedance rates has been assessed
linearmodel = lm(ExcedRatio25 ~ ExcedRatioPM10)
plot(ExcedRatioPM10, ExcedRatio25)
abline(linearmodel)
summary(linearmodel)
cor.test(ExcedRatioPM10, ExcedRatio25, na.rm = TRUE)


### [unpublished]
### diagnostic plots indicates mild lack of normality into residuals
### possible explanation: two different clusters
par(mfrow = c(2,2))
plot(linearmodel)


### Repeating exploratory analysis on PM2.5 exceedance rates by 
### recursive partitioning tree analysis, an 11 cutoff was detected ...
monovariatePM25 = LogCasesProp ~  ExcedRatio25 
plot(ctree(monovariatePM25))  
excedPM2.5 = factor(ExcedRatio25 > 11)
levels(excedPM2.5)[2] = "higherPM2.5"
levels(excedPM2.5)[1] = "lowerPM2.5"


### ...  and an improved tree was disclosed, as depicted in Figure 3C.
Relation3C = LogCasesProp ~ exceedPM10 + excedPM2.5 + CommutersDensity
Figure3C = ctree(Relation3C)
plot(Figure3C)


###  Despite the limitation of not having 43 over 110 complete data on PM2.5 exceedances 
### (39% missing values) ... 
summary(excedPM2.5)
length(excedPM2.5)
summary(excedPM2.5)/length(excedPM2.5)
threelevelsclass = factor(as.numeric(exceedPM10) * as.numeric(excedPM2.5))
levels(threelevelsclass)[1] = "lowbothPM"
levels(threelevelsclass)[2] = "highPM10lowPM2.5"
levels(threelevelsclass)[3] = "highbothPM"


### it appears that, coherently with main study outcome, 
### strong PM2.5 exceedance rates are eminently located in Po Valley. 
### [unpublished Figure]
ormea = c(7.77, 44.1)
fraconalto = c(8.88, 44.58)
sanmarino =c(12.45, 43.94)
plot(Long, Lat, col = "white")
points(ormea[1], ormea[2], col = "purple")
points(fraconalto[1], fraconalto[2], col = "purple")
points(sanmarino[1], sanmarino[2], col = "purple")
confineLon = c(ormea[1], fraconalto[1], sanmarino[1])
confineLat = c(ormea[2], fraconalto[2], sanmarino[2])
lines(confineLon, confineLat, col = "purple", lwd = 3)
points(Long[threelevelsclass == "lowbothPM"], 
		Lat[threelevelsclass == "lowbothPM"], bg = "green", pch = 23)
points(Long[threelevelsclass == "highPM10lowPM2.5"], 
		Lat[threelevelsclass == "highPM10lowPM2.5"], bg = "yellow", pch = 23)
points(Long[threelevelsclass == "highbothPM"], 
		Lat[threelevelsclass == "highbothPM"], bg = "red", pch = 23)


### The following table depicts in detail the situation:
table(threelevelsclass, Where)