---
title: "bixplot_examples"
author: "Rousseeuw, P.J."
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{bixplot_examples}
  %\VignetteEncoding{UTF-8}
  %\VignetteEngine{knitr::rmarkdown}
editor_options: 
  markdown: 
    wrap: 72
---

```{r, echo = FALSE}
knitr::opts_chunk$set(
  fig.width  = 7,
  fig.height = 4,
  fig.align  = 'center'
)
oldpar <- list(mar = par()$mar, mfrow = par()$mfrow)
```

# Introduction

This vignette illustrates the `bixplot` function, which visualizes a
univariate dataset by combining an estimated density body, a
box-and-whisker summary, and a rug of individual data values. For
variables with more than one mode, the function automatically detects
clusters and displays each mode separately. The examples below start
with simple univariate comparisons and continue with more elaborate
multivariable layouts and rug coloring options.

```{r}
library(vioplot)
library(robustHD)
library(classmap)
```

------------------------------------------------------------------------

# Unimodal, bimodal and multimodal data

We start with three simulated variables: one unimodal, one bimodal and
one trimodal. This illustrates the core difference between a violin plot
and a bixplot.

```{r}
set.seed(1)
dat1 <- rnorm(120, 0, 2.5)
dat2 <- c(rnorm(80, -3, 1), rnorm(40, 3, 1))
dat3 <- c(rnorm(25, -4, 0.8), rnorm(50, 0, 0.8), rnorm(40, 4, 0.8))
xlist <- list(Unimodal = dat1, Bimodal = dat2, Multimodal = dat3)
```

The default call already gives an informative result:

```{r}
bixout <- bixplot(xlist, main = "bixplot")
```

The returned list `bixout` contains the cluster structure and
five-number summaries for each variable:

```{r}
bixout
```

For comparison, below we display the violin plot and the bixplot side by
side. The bixplot separates the modes of the bimodal and trimodal
variables and draws a dedicated box for each cluster.

```{r, fig.width = 8, fig.height = 4}
ylim <- c(-8.5, 9)
par(las = 1, mfrow = c(1, 2))
par(mar = c(2.1, 2.2, 1.7, 2))
viocol <- adjustcolor("chocolate3", alpha.f = 0.5)
vioplot::vioplot(xlist, ylim = ylim, main = "", col = viocol)
title(main = "violin plot", line = 0.5, cex.main = 1)
par(mar = c(2.1, 2.2, 1.7, 0.2))
bixplot(xlist, ylim = ylim, main = "")
title(main = "bixplot", line = 0.5, cex.main = 1)
par(oldpar)
```

------------------------------------------------------------------------

# Latency data and penguin bill length

Next we apply `bixplot` to two real datasets: a latency dataset with
three variables and the penguin bill length variable grouped by island.

```{r, fig.width = 10, fig.height = 7}
data("data_latenc")
dim(data_latenc) # 40 rows, 3 columns

par(las = 1, mfrow = c(2, 2))
mar1 <- c(2.1, 2.4, 2.3, 2)
mar2 <- c(2.1, 2.2, 2.3, 0.2)
viocol <- adjustcolor("chocolate3", alpha.f = 0.5)

par(mar = mar1)
vioplot::vioplot(data_latenc, main = "", col = viocol)
title(main = "violin plot", cex.main = 1.2, line = 0.6)

par(mar = mar2)
mymodeCol <- c("cadetblue3", "hotpink2")
bixplot(data_latenc, main = "", cutmin = 0, cutmax = 300,
        ylim = c(0, 300), modeCol = mymodeCol)
title(main = "bixplot", cex.main = 1.2, line = 0.6)

# Score the islands in a meaningful order
islandscore <- rep(NA, length(penguins$island))
islandscore[penguins$island == "Torgersen"] <- 1
islandscore[penguins$island == "Biscoe"]    <- 2
islandscore[penguins$island == "Dream"]     <- 3
ylim <- c(29, 62)

par(mar = mar1)
vioplot::vioplot(bill_len ~ reorder(island, islandscore, mean),
                 data = penguins, main = "",
                 col = viocol, xlab = "", ylab = "",
                 ylim = ylim, cex.axis = 1)
title(main = "violin plot", cex.main = 1.2, line = 0.6)

par(mar = mar2)
bixplot(bill_len ~ reorder(island, islandscore, mean),
        data = penguins, main = "", ylim = ylim,
        bodyCol = "gray40", bodyOpaque = 0.3)
title(main = "bixplot", cex.main = 1.2, line = 0.6)
par(oldpar)
```

For the latency variables, the bixplot correctly identifies a bimodal
structure that is obscured by the smooth envelope of the violin plot.
For the penguin bill lengths, the Biscoe island group is clearly
bimodal, reflecting the presence of two penguin species.

------------------------------------------------------------------------

# Iris data

The standardized iris dataset has four variables. The bixplot reveals
that petal length and petal width have more than one mode (cluster).

```{r, fig.width = 10, fig.height = 5}
par(mfrow = c(1, 2))
par(mar = c(4, 2, 2, 0.1))

diris <- data.frame(scale(iris[, 1:4]))
colnames(diris) <- c("Sepal.L", "Sepal.W", "Petal.L", "Petal.W")

mymodeCol <- c("cadetblue3", "hotpink2",
               "cadetblue3", "hotpink2",
               "lawngreen")
bixplot(diris, main = "", cut = 3, col = "gray75",
        bodyOpaque = 0.6, rugW = 0.16,
        rugoutCol = "red", curveLwd = 0.5,
        modeCol = mymodeCol,
        ylim = c(-3.6, 4.2), yaxs = "i",
        xlab = "standardized variables")
title(main = "bixplot display of iris data", cex.main = 1, line = 0.7)
```

We can also overlay marginal bixplots on top of a scatter plot, to show
the marginal distributions of the axes variables. Here we add horizontal
and vertical half-bixplots to a scatter plot of petal length versus
petal width.

```{r, fig.width = 5, fig.height = 5}
par(mar = c(4, 4, 2, 0.2))
x <- diris$Petal.L
y <- diris$Petal.W
xlim <- c(-2.3, 2.4)
ylim <- c(-2.2, 2.4)
xyratio <- (xlim[2] - xlim[1]) / (ylim[2] - ylim[1])

plot(x, y, xlim = xlim, ylim = ylim, pch = 16,
     xlab = "", ylab = "", xaxs = "i", yaxs = "i")
title(xlab = "Petal.L", line = 2)
title(ylab = "Petal.W", line = 2)
title(main = "petal length versus petal width", cex.main = 1, line = 0.7)

bixplot(x, add = TRUE, horizontal = TRUE,
        at = ylim[1] + 0.015, cutmin = xlim[1],
        boxwex = 0.9, curveLwd = 0.5,
        border = "black", side = "second",
        bodyOpaque = 0.6)
bixplot(y, add = TRUE, horizontal = FALSE,
        at = xlim[1] + 0.015, cutmin = ylim[1],
        boxwex = xyratio * 0.9,
        modeCol = c("cadetblue3", "hotpink2", "lawngreen"),
        curveLwd = 0.5, side = "second",
        bodyOpaque = 0.6)
par(oldpar)
```

The argument `xyratio` scales the vertical bixplot so that its density
body has the same visual height as the horizontal one.

------------------------------------------------------------------------

# Body sizing options for multimodal variables

The `bodysize` argument controls how the density bodies of individual
modes within a variable are sized relative to each other. The three
options are illustrated below on the (bimodal) petal length variable.

```{r, fig.width = 6, fig.height = 2.4}
par(mfrow = c(1, 3))
par(mar = c(2.5, 2, 2, 1))

for (bs in c("width_is_constant", "area_is_constant", "area_from_count")) {
  bixplot(diris[, 3], main = "", cut = 3, col = "gray75",
          bodyOpaque = 0.5, bodysize = bs, curveLwd = 0.5,
          ylim = c(-2.2, 2.4), yaxs = "i", names = "Petal.L")
  title(main = switch(bs,
                      width_is_constant = "equal width",
                      area_is_constant  = "equal area",
                      area_from_count   = "area from count"),
        cex.main = 1, line = 0.7)
}
par(oldpar)
```

With `"area_from_count"` (the default), the larger cluster receives a
proportionally higher area, making cluster sizes immediately apparent.

------------------------------------------------------------------------

# Bill length by island and sex

Here we use the formula interface to plot bill length split by the
interaction of sex and island. We compare a standard multi-group bixplot
with the `side = "both"` layout, which pairs male and female
distributions on opposing sides of a shared axis.

```{r, fig.width = 12, fig.height = 6}
par(las = 1, mfrow = c(1, 2))

islscore <- rep(NA, length(penguins$island))
islscore[penguins$island == "Torgersen"] <- 1
islscore[penguins$island == "Biscoe"]    <- 2
islscore[penguins$island == "Dream"]     <- 3
ylim    <- c(27, 64)
mynames <- c("Torger.F", "Torger.M", "Biscoe.F",
             "Biscoe.M", "Dream.F",  "Dream.M")
mycol      <- c("slateblue2", "orange")
mymodeCol  <- c("darkorchid1", "slateblue3",
                "goldenrod1",  "darkorange2")

par(mar = c(2.9, 2, 0.8, 1))
bixplot(bill_len ~ sex + reorder(island, islscore, mean),
        data = penguins, names = mynames, modeCol = mymodeCol,
        bodyOpaque = 0.6, ylim = ylim, bodyW = 0.9,
        col = mycol, main = "", rugCol = "black", las = 1)
legend("topleft", legend = c("female", "male"),
       fill = c("slateblue3", "orange"), cex = 1.5)

par(mar = c(2.9, 3, 0.8, 0.1))
bixplot(bill_len ~ sex + reorder(island, islscore, mean),
        data = penguins, main = "", rugCol = "black",
        bodyOpaque = 0.6, ylim = ylim, bodyW = 0.9,
        col = mycol, stickCol = "gray10", stickLwd = 1,
        side = "both", modeCol = mymodeCol, las = 1,
        names = c("Torgersen", "Biscoe", "Dream"))
legend("topleft", legend = c("female", "male"),
       fill = mycol, cex = 1.5)
par(oldpar)
```

With `side = "both"` the number of axis labels is halved and the
male/female pair for each island shares one axis, making comparisons
within an island more direct. Here, male penguins tend to have larger
bill lengths than females.

------------------------------------------------------------------------

# Rug colored by an external numeric variable

The `rugNumeric` argument colors each rug tick mark according to a
continuous covariate via a smooth color palette. Here the rug is colored
by body mass, and a color bar legend is added automatically.

```{r, fig.width = 5, fig.height = 5}
par(las = 1, mfrow = c(1, 1))
par(mar = c(3.2, 3.2, 2, 2.6))

scpenguins <- scale(penguins[, c(4, 3, 5)])
varnames   <- c("bill_depth", "bill_length", "flipper_length")

bixplot(scpenguins, rugNumeric = penguins$body_mass,
        main = "", boxW = 0.30, rugW = 0.24,
        names = varnames, boxLwd = 1.2, boxOpaque = 1,
        curveLwd = 2, colorbarW = 0.16, cex.colorbar = 0.8,
        bodysize = "width_is_constant",
        bodyCol = "grey90", modeCol = "grey90",
        xlab = "feature", ylab = "standardized value")
title(main = "penguins with rug color by body_mass       ",
      line = 0.6)
par(oldpar)
```

We see that heavier penguins (green) tend to have a larger flipper
length, revealing a positive association between these two variables.

------------------------------------------------------------------------

# Rug colored by a factor variable

The `rugFactor` argument colors each rug tick mark by a factor level.
Below we display the same three penguin measurements with rug color
indicating species, this time using a horizontal layout.

```{r, fig.width = 5, fig.height = 5}
par(mfrow = c(1, 1))
par(mar = c(3.2, 3.8, 2, 0.2))

rugFactorColors <- c("red", "blue", "forestgreen")

bixplot(scpenguins, rugFactor = penguins$species,
        main = "", boxW = 0.30, rugW = 0.24,
        ylim = c(-3, 4), names = varnames,
        boxLwd = 1.2, boxOpaque = 1, curveLwd = 2,
        horizontal = TRUE, bodysize = "width_is_constant",
        bodyCol = "grey90", modeCol = "grey90",
        xlab = "standardized value", ylab = "feature",
        rugFactorColors = rugFactorColors, las = 0)
title(main = "penguins with rug color by species", line = 0.6)
legend(x = 2.36, y = 2.8,
       legend = c("Adelie", "Chinstrap", "Gentoo"),
       fill = rugFactorColors, cex = 0.88)
par(oldpar)
```

The species labels in the rug indicate that Gentoo penguins tend to have
smaller `bill_depth` and longer `flipper_length`.

------------------------------------------------------------------------

# Top Gear car data

We now illustrate `bixplot` on the Top Gear dataset from the `robustHD`
package, which contains performance and price data for about three
hundred cars.

```{r}
data(TopGear, package = "robustHD")
scars        <- TopGear[, c(14, 12, 5, 7, 8)]
scars[, 3]   <- log(scars[, 3])
colnames(scars)[3] <- "log(Price)"
scars[, 1:4] <- scale(scars[, 1:4])
```

An initial bixplot reveals a severe outlier in the Weight variable:

```{r}
bixplot(scars[, 1:4])
```

```{r}
which.min(TopGear$Weight) # row 199
TopGear[199, c(1, 2, 14, 12, 5, 7)]
```

The Peugeot 107 has a recorded weight of 210, which is clearly a data
error. We set it to `NA` and replot:

```{r}
scars[199, 1] <- NA
bixplot(scars[, 1:4])
```

There is also an outlier in TopSpeed: the value 50 of the Renault Twizy,
that is a tiny city car. This value is correct, so it is retained.

```{r}
which.min(TopGear$TopSpeed) # row 220
TopGear[220, c(1, 2, 14, 12, 5, 7)]
```

We now produce a final two-panel figure: a bixplot of the four
standardized variables alongside a scatter plot of TopSpeed versus
Displacement with marginal bixplots added on each axis.

```{r, fig.width = 11, fig.height = 5.5}
par(las = 1, mfrow = c(1, 2))
par(mar = c(4, 2, 2, 0.1))

bixplot(scars[, 1:4], ylim = c(-3.8, 5.2), main = "",
        col = "darkgoldenrod2", yaxs = "i",
        modeCol = c("cadetblue3", "hotpink2"),
        bodyOpaque = 0.6, las = 1)
title(main = "standardized Top Gear variables",
      cex.main = 1, line = 0.7)

par(mar = c(4, 4, 2, 0.1))
x <- jitter(scars[, 4])
y <- jitter(scars[, 2])
mycol <- rep(NA, nrow(scars))
mycol[scars[, 5] == "Front"] <- "red"
mycol[scars[, 5] == "Rear"]  <- "forestgreen"
mycol[scars[, 5] == "4WD"]   <- "orange"

plot(x, y, xlim = c(-1.999, 3.999), ylim = c(-2.999, 4.5),
     xaxs = "i", yaxs = "i", xlab = "", ylab = "",
     pch = 16, cex = 1, col = mycol, las = 1)
title(xlab = "Displacement", line = 2)
title(ylab = "TopSpeed",     line = 2)
title(main = "TopSpeed versus Displacement",
      cex.main = 1, line = 0.7)

bixplot(scars[, 4], add = TRUE, at = -2.98, horizontal = TRUE,
        side = "second", boxwex = 2, bodyOpaque = 0.6)
bixplot(scars[, 2], add = TRUE, at = -1.985, horizontal = FALSE,
        side = "second", boxwex = 1.4, bodyOpaque = 0.6)

legend(x = -1.5, y = 4, title = "DriveWheel",
       legend = c("Front", "Rear", "4WD"),
       fill = c("red", "forestgreen", "orange"), cex = 1)
par(oldpar)
```

Rear-wheel-drive cars (green) span the full range of displacement and
are concentrated at the higher end of `TopSpeed`, whereas
front-wheel-drive cars (red) have rather low values of both `TopSpeed`
and `Displacement`.

------------------------------------------------------------------------

# Tooth growth and iris data with `side = "both"`

The `side = "both"` option is particularly useful for two-group
comparisons, as it places paired distributions on opposite sides of a
shared axis.

```{r, fig.width = 9.2, fig.height = 4.6}
par(las = 0, mfrow = c(1, 2))
par(mar = c(3.5, 2, 2, 1))

bixplot(len ~ supp * dose,
        data = ToothGrowth, side = "both",
        col = c("orange", "slateblue3"),
        main = "", xlab = "Vitamin C dose (mg)",
        ylab = "tooth length", bodyOpaque = 0.7,
        ylim = c(-1, 44), yaxs = "i", las = 0,
        stickCol = "black", stickLwd = 1)
title(main = "guinea pigs' tooth growth by supplement type",
      cex.main = 1, line = 0.7)
legend("topleft", title = "supplement",
       legend = c("OJ", "AA"),
       fill = c("orange", "slateblue3"), cex = 1)

par(mar = c(3.5, 2.8, 2, 0.1))
diris <- data.frame(scale(iris[, 1:4]))
colnames(diris) <- c("Sepal.L", "Sepal.W", "Petal.L", "Petal.W")

bixplot(diris, side = "both", main = "",
        bodysize = "width_is_constant",
        col = c("orange", "slateblue3"),
        modeCol = c("cadetblue3", "hotpink2",
                    "lawngreen", "gray60", "cyan3"),
        stickCol = "red", stickLwd = 1,
        bodyOpaque = 0.7, las = 0,
        xlab = "standardized measurements")
title(main = "iris data by length and width",
      cex.main = 1, line = 0.7)
legend("top", legend = c("length", "width"),
       fill = c("orange", "slateblue3"), cex = 1)
par(oldpar)
```

In the tooth growth data we see that the orange juice (OJ) supplement
yields higher tooth growth than ascorbic acid (AA), except at the
highest dose, where the medians of OJ and AA align and AA has a larger
spread than OJ. In the right panel, the iris data are now shown on two
vertical axes instead of four.

------------------------------------------------------------------------

# Iris data with rug coloring

Finally, we revisit the iris data with rug colors tied to an external
variable, first a continuous one (sepal length) and then a factor
(species).

```{r, fig.width = 5.5, fig.height = 5}
par(las = 1, mfrow = c(1, 1))
par(mar = c(2.2, 2.2, 2, 2.6))

bixplot(diris, main = "",
        rugNumeric = iris$Sepal.Length, colorbarW = 0.15,
        bodyCol = "grey80", modeCol = "grey80", las = 1)
title(main = "iris data with rug color by sepal length      ",
      line = 0.6)
par(oldpar)
```

Flowers with high sepal length (green) tend to have high petal length
and petal width. There is no such relation between sepal length and
sepal width.

```{r, fig.width = 5, fig.height = 5}
par(las = 1, mfrow = c(1, 1))
par(mar = c(2.2, 2.2, 2, 0.2))

rugFactorColors <- c("red", "blue", "forestgreen")
bixplot(diris, main = "", rugFactor = iris$Species,
        bodyCol = "grey80", modeCol = "grey80",
        rugFactorColors = rugFactorColors, las = 1)
title(main = "iris data with rug color by species", line = 0.6)
legend("topright",
       legend = c("setosa", "versicolor", "virginica"),
       fill = rugFactorColors, cex = 1)
par(oldpar)
```

The three species nicely match the clusters in `Petal.W` and `Petal.L`.
The colors inside `Sepal.L` are roughly similar, whereas `Sepal.W` again
behaves differently.

------------------------------------------------------------------------

# Titanic data

As a final example we use the Titanic dataset from the `classmap`
package. We compare the standardized age and log-fare distributions for
casualties and survivors using the `side = "both"` layout, and then
color the rug by cabin class.

```{r}
data(data_titanic, package = "classmap")
titanic         <- data_titanic[1:100, ]
titanic$logFare <- log(titanic$Fare + 1)
titanic$Pclass  <- as.factor(titanic$Pclass)
titanic[, c(5, 14)] <- scale(titanic[, c(5, 14)])

xt <- list(titanic[titanic$y == "casualty",  5],
           titanic[titanic$y == "survived",  5],
           titanic[titanic$y == "casualty",  14],
           titanic[titanic$y == "survived",  14])
names(xt) <- c("standardized Age.C",
               "standardized Age.S",
               "standardized log(Fare).C",
               "standardized log(Fare).S")
```

```{r, fig.width = 9.2, fig.height = 4.6}
par(las = 1, mfrow = c(1, 2))
par(mar = c(2.4, 2, 2, 1))

mycol <- c("coral2", "cadetblue3")

bixplot(xt, side = "both", main = "", col = mycol,
        stickLwd = 1, stickCol = "purple",
        boxW = 0.18, rugW = 0.10, las = 1)
title(main = "Titanic data by survival", line = 0.6)
legend("top", legend = c("casualty", "survived"),
       fill = mycol, cex = 0.9)

par(mar = c(2.4, 3, 2, 0.1))
bixplot(titanic[, c(5, 14)], main = "",
        rugFactor = titanic$Pclass, boxW = c(0.22, 0.18),
        names = c("standardized Age",
                  "standardized log(Fare)"), las = 1)
title(main = "Titanic data by cabin class", line = 0.6)
legend("top", title = "cabin class",
       legend = c("1", "2", "3"),
       fill = c("red", "blue", "forestgreen"), cex = 0.9)
par(oldpar)
```

In the left panel we see that the median of `Age` and especially its
mode are higher for the survivors, with a smaller effect of `Fare`. On
the right we see that both `Age` and `Fare` have an effect on cabin
class.

------------------------------------------------------------------------