## Created on Saturday, November 27, 2021 at 9:06am EST by WeekendEditor on WeekendEditorMachine.
## Copyright (c) 2021, SomeWeekendReading.  All rights reserved.  As if you care.

suppressPackageStartupMessages({                       # Ssshh!  Quiet in the library...
  ## NB: relative pathname works here because the launch script pushd's to this directory when loading
  toolsDir <- "../../tools"                            # Tools available from the author
  source(file.path(toolsDir, "pipeline-tools.r"))      # Pipeline construction tools
  source(file.path(toolsDir, "graphics-tools.r"))      # Various graphics tools

  library("plyr")                                      # For ldply()
  library("RCurl")                                     # For getURLContent()
  library("fitdistrplus")                              # For fitdist()
})                                                     #

##
## Collect stats on posts over time.
##
## We peruse the local snapshot of the repository, looking for post markdown files (_posts/*.md),
## then interrogating the hit count server for the coresponding post keys (api.countapi.xyz/get).
## We also delve into the _data/comments subdirectories, counting comments for each post.
##
## This carefully uses the get (rather than bump) interface, so it doesn't alter the counter values.
##

## Examples:
##
## > postData <- postStats()
##
## or, to make files for the year-in-review post:
##
## *** For 2022 l'etat du blog post; replace XXX.txt with transcript file based on post filename.
## > postData2022    <- postStats(year = 2022L, destDir = ".", txFile = "XXX.txt")
## > postDataOmnibus <- postStats(              destDir = ".", txFile = "XXX.txt")
##
## or, in an R session to prevent recomputation of earlier phases:
##
## > postData <- postStats(clear = c("postData", "postDataSaved", "plotDone")) # or a subset of those
##
## See also post-stats, an RScript file in this directory for use at the (Unix) command line:
## $ ./assets/post-stats
##

## *** Do regression in left half of image, splice in heatmap image in right half of image?
## *** Something like tobit regression of comments on hits?
## *** Other stats: # images for each post, byte count including images for each post, ...
## *** Graphics: quarterly boxplot of hits/comments, byte counts vs time, ...
## *** Show histogram sideways up against the y axis of the scatterplot?  (Log scale!)
##     See: https://www.r-bloggers.com/2012/09/example-10-3-enhanced-scatterplot-with-marginal-histograms/
postStats <- function(## Inputs
                      year          = NA,
                      clearVars     = c("postData", "postDataSaved", "plotDone", "summaryDone"),
                      ## Most of the time, these defaults are what you want
                      blogRoot      = "~/Documents/laboratory/someweekendreading.github.io",
                      postsDir      = file.path(blogRoot, "_posts"),
                      commentsDir   = file.path(blogRoot, "_data/comments"),
                      ## 2 capture groups: (1) for the post date, (2) for the post name in counters
                      postPatt      = "^([0-9]{4}-[0-9]{2}-[0-9]{2})-(.*)\\.md$",
                      commentPatt   = "^.*entry([0-9]+)\\.yml$",
                      ## 1 capture group: get the integer value from the JSON returned
                      jsonRegexp    = "^\\{.*\"value\":([0-9]+).*\\}$",
                      blogName      = "www.someweekendreading.blog",
                      countURL      = sprintf("https://api.countapi.xyz/get/%s", blogName),
                      ## 2022-Jan-14: SSL certificate is expired; emailed author
                      ## 2022-Feb-21: still expired; need command-line test case with curl?
                      ## 2022-Mar-02: still expired; wrote author again with curl examples:
                      ## $ curl --get https://api.countapi.xyz/get/www.someweekendreading.blog/.moderna-monkey-trial.
                      ## $ curl --insecure --get https://api.countapi.xyz/get/www.someweekendreading.blog/.moderna-monkey-trial.
                      ## Appears to be fixed as of 2022-Mar-27.
                      sslVerify     = TRUE, #FALSE,
                      hitStartDate  = as.Date("2021-Jul-15", format = "%Y-%b-%d"),
                      postStartDate = as.Date("2020-Jul-01", format = "%Y-%b-%d"),
                      today         = Sys.Date(),      # Today is when hits get measured
                      plotWidth     = 800,             # Shape of hits/comments vs time plot
                      plotHeight    = plotWidth,       # 2x2 array of plots, so why not square?
                      clGray        = gray(level = 0.80, alpha = 0.50),

                      ## Outputs (mostly, these defaults are what you want, except txFile and destDir)
                      destDir  = file.path(blogRoot, "_drafts"),
                      txFile   = sprintf("post-stats-%s.txt", format(today, format = "%Y-%b-%d")),
                      dataFile = if (is.null(txFile))  # If no transcript file
                                   NULL                # Then no data file
                                 else                  # Else derive data file from transcript file
                                   sub("^(.*)\\.txt$", "\\1.tsv", txFile),
                      plotFile = if (is.null(txFile))  # If no transcript file
                                   NULL                # Then no plot file
                                 else                  # Else derive plot file from transcript file
                                   sub("^(.*)\\.txt$", "\\1-1.png", txFile)) {

  dateYear    <- function(d) { as.integer(format(d, format = "%Y"))                           }
  dateYearEnd <- function(d) { as.Date(sprintf("%d-12-31", dateYear(d)), format = "%Y-%m-%d") }

  getPostData <- function(postStartDate, hitStartDate, today, postsDir, postPatt, jsonRegexp,
                          countURL, commentsDir, commentPatt, year) {
    cat(sprintf(paste("* Dates:",                      # First report date when counting started
                      "\n  - Date posts started:        %s",
                      "\n  - Date hit counting started: %s",
                      "\n  - Today:                     %s\n",
                      sep = ""),                       #  and today, so measuring blog hits btw dates
                format(postStartDate, format = "%Y-%b-%d"),
                format(hitStartDate, format = "%Y-%b-%d"),
                format(today,     format = "%Y-%b-%d")))
    postFiles <- list.files(path = postsDir, pattern = postPatt)
    cat(sprintf("\n* Found %d posts to check for hit counts.\n", length(postFiles)))
    postData <- ldply(postFiles, function(postFile) {  # Map over postFiles
      ## What the analytics page does:
      ## https://api.countapi.xyz/get/www.someweekendreading.blog/{{ page.url | escape | replace: "/", "." }}
      ##
      ## NB: page.url returns a string like "/name/", so this escapes and puts periods
      ## BOTH before AND after: ".name." -- a historical accident, but let's stick with it,
      ## since all our counters at countapi.xyz have that in them now.
      postRoot     <- sub(postPatt, "\\2", postFile)   # Extract root of post filename
      postComments <- file.path(commentsDir, postRoot) # Where comments are, if any
      data.frame(PostFile     = postFile,              # Construct dataframe row for this postFile
                 PostDate     = as.Date(sub(postPatt, "\\1", postFile), format = "%Y-%m-%d"),
                 PostComments = if (dir.exists(postComments))
                                  length(list.files(path = postComments, pattern = commentPatt))
                                else                   # Count comment files for this post
                                  0,                   #   most of the time there are none
                 PostHits     = as.integer(            # Interrogate hit server for this post
                                  sub(jsonRegexp, "\\1",
                                      ## I wish we could batch these, not slowly 1 by 1!
                                      getURLContent(sprintf("%s/%s",
                                                            countURL,
                                                            gsub("/", "\\.",
                                                                 URLencode(sprintf("/%s/",
                                                                                   postRoot)))),
                                                    .opts = RCurl::curlOptions(
                                                              ssl.verifypeer = sslVerify)))),
                 HitsStart    = hitStartDate,          # Keep track of when we started counting,
                 HitsEnd      = today)                 #  and today.  Counts are in that interval.
    }, .progress = progress_text())                    # Takes a minute; might as well show progress

    ## Yes, it would be faster to do this filter in the ldply() loop above, downloading only
    ## the counts in which we're interested.  On the other hand, the server will GC counts that
    ## haven't been accessed in 6 months, so doing it this way keeps all the keys current.
    ## (https://countapi.xyz says it must be "updated" every 6 months; I'm not sure if access via
    ## "get" constitutes "update"?)
    if (!is.na(year)) {                                # Wants to restrict to a single year
      stopifnot(is.integer(year)                &&     # Check the year is a reasonable number
                dateYear(postStartDate) <= year &&     #
                year                    <= dateYear(Sys.Date()))
      cat(sprintf("\n* Restricting data to just the year %d", year))
      minDate <- as.Date(sprintf("%d-Jan-01", year), format = "%Y-%b-%d")
      maxDate <- as.Date(sprintf("%d-Dec-31", year), format = "%Y-%b-%d")
      postData <- subset(postData, subset = minDate <= PostDate & PostDate <= maxDate)
    }                                                  # postData is now just for that year

    cat(sprintf("\n  - Result: %d rows (posts) of data:\n", nrow(postData)))
    showDataframeHeadTail(postData)                    # Show first few and last few rows

    postData                                           # Return data for posts
  }                                                    #

  savePostData <- function(destDir, dataFile, postData) {
    if (!is.null(dataFile)) {                          # If specified destination file
      f <- file.path(destDir, dataFile)                #  make full pathname
      saveDataframe(postData, f)                       #   then save results
      cat(sprintf("* Saved to %s.\n", f))              #   report it
      TRUE                                             #   and flag that it was done
    } else {                                           # Else doesn't want report saved
      cat(sprintf("* Results not saved.\n"))           #   so don't do that
      FALSE                                            #   and flag that it was NOT done
    }                                                  #
  }                                                    #

  plotDataVsTime <- function(postData, blogName, clGray, plotWidth, plotHeight, destDir, plotFile) {

    scatterplotWithLOESS <- function(postData, colName, clGray, log, doLegend, histBreaks,
                                     barPlot = FALSE) {#
      withPars(function() {                            # Set label orientation & add space @ bottom
        xlim      <- range(postData$"PostDate")        # Range of dates having posts
        xlim[[2]] <- dateYearEnd(xlim[[2]]) + 1        #  Pull up to year end of last post
        plot(panel.first = {                           # First do LOESS plot & 95%CL
          ## LOESS fit and 95% confidence interval as a function of time.  Imitated from example at:
          ## https://stackoverflow.com/questions/22717930/how-to-get-the-confidence-intervals-for-lowess-fit-using-r
          plx       <- predict(loess(as.formula(sprintf("%s ~ PostDays", colName)),
                                     data = transform(postData,
                                                      PostDays = as.numeric(
                                                          PostDate - min(postData$"PostDate")))),
                               se = TRUE)              # Get predictions and standard errors
          ucl       <- plx$"fit" + qt(0.975, plx$"df") * plx$"se"
          lcl       <- plx$"fit" - qt(0.975, plx$"df") * plx$"se"
          minPosLCL <- min(subset(lcl, subset = lcl > 0))
          lcl       <- pmax(lcl, minPosLCL)            # Clip negatives up to min pos (log scale!)
          polygon(x = c(postData$"PostDate", rev(postData$"PostDate")),
                  y = c(lcl, rev(ucl)),                # Polygon with shade of confidence limits
                  col = clGray, border = NA)           # Then LOESS line and CL borders (dashed)
          lines(postData$"PostDate", plx$"fit", lwd = 2)
          lines(postData$"PostDate", ucl,       lty = "dashed")
          lines(postData$"PostDate", lcl,       lty = "dashed")
        },                                             # Preliminaries done; now rest of plot:
        x = postData$"PostDate", y = postData[, colName], pch = 21, bg = "blue",
        xlim = xlim, ## ylim = c(1, max(postData[, colName])),
        main = sprintf("%s vs Time", colName), log = log, xaxt = "n", xlab = NA, #
        ylab = sprintf("%s (%s scale)", colName, if (nchar(log) > 0) "log" else "linear"))

        withPars(function() {                          # Horiz axis only: extra space for date labels
          minPostDate <- min(postData$"PostDate")      # Get date range to be covered,
          maxPostDate <- max(postData$"PostDate")      #   make 1st day of month for each
          axis.Date(side = 1, format = "%Y-%b-%d", labels = TRUE,
                    at = seq(from = minPostDate - as.integer(getDay(minPostDate)) + 1, #1st day
                             to   = maxPostDate - as.integer(getDay(maxPostDate)) + 1, # of month
                             by   = "2 months"))       # seq.Date() is pretty clever
          ## Don't really need horizontal axis label, since dates make it obvious it's the post date
        }, mgp = c(7.5, 0.5, 0))                       # Horizontal axis only: title, label, tick

        rug(postData$"PostDate", side = 1, col = "gray") # Poor man's marginal histograms
        rug(postData[, colName], side = 2, col = "gray") #  done as rug plots

        sapply(seq(from = dateYear(xlim[[1]]), to = dateYear(xlim[[2]])), function(yr) {
          abline(v = as.Date(sprintf("%4d-Jan-01", yr), format = "%Y-%b-%d"),
                 lty = "solid", col = "gray")          # Draw vertical gray line @ Jan 01 of each year
        })                                             #  between min post date and max post date
        ## Vertical dashed line when hit-counting was turned on (mid-2021)
        abline(v = postData[1, "HitsStart"], col = "gray", lty = "dashed")

        if (doLegend)                                  # Want to do legend?
          legend("topleft", bg = "antiquewhite", inset = c(0.01, 0.01), # 0.03 if on left, for y rug
                 pch    = c(21,                NA,                    22,        NA,       NA),
                 pt.bg  = c("blue",            NA,                    clGray,    NA,       NA),
                 pt.cex = c(1.5,               NA,                    3,         NA,       NA),
                 col    = c("black",           "black",               clGray,    "gray",   "gray"),
                 lty    = c(NA,                "solid",               "dashed",  "solid",  "dashed"),
                 lwd    = c(NA,                2,                     NA,        1,        1),
                 legend = c("Individual post", "LOESS central trend", "LOESS 95% CI",
                            "Year boundary", sprintf("Hits since: %s",
                                                     postData[1, "HitsStart"])))

      }, las = 3,                                      # Always vertical labels, both axes
         mar = c(7.5, 3, 2, 1))                        # Extra margin @ bottom for date labels

      if (barPlot)                                     # Barplot if too few vals to histogram
        barplot(height = table(postData[, colName]), col = "blue", space = 0,
                xlab = colName, ylab = sprintf("Freq(%s)", colName),
                main = sprintf("%s Distribution", colName))
      else {                                           # Else ordinary histogram

        hist(x = postData[, colName], col = "blue", breaks = histBreaks, prob = TRUE,
             xlab = colName, ylab = sprintf("Prob(%s)", colName),
             main = sprintf("%s Distribution", colName))
        vals <- round(seq(from = 1, to = max(postData[, colName]), length.out = 1000)) # NB: ints!

        ## Distributions: https://stat.ethz.ch/R-manual/R-devel/library/stats/html/Distributions.html
        ##
        ## We want a distribution which is:
        ## (a) univariate,
        ## (b) unimodal,
        ## (c) bdd below @ 0, and
        ## (d) has a heavy right tail
        ##
        ## lnorm, gamma, negbinomial, weibull, poisson all look sort of like candidates and
        ## initial tomfoolery with fitdist() reveals they converge to an estimate pretty readily.
        ##
        ## Other choices to try, from the common univariate distributions available in R:
        ##  beta, binom, cauchy, chisq, exp, f, geom, hyper, norm, t, unif
        ##
        ## Rule out based on (a): unif
        ## Rule out based on (c) and (d): cauchy, norm, t
        ## Rule out geom as it is special case of negbinom
        ## Rule out exp based on observed shape (no decrease to left of peak)
        ## Rule out beta, because values have to be in [0, 1]
        ## Rule out binomial: needs starting guess, and for large numbers it's essentially poisson
        ##  anyway, so asymptotically this has been tested.  (Confirmed numerically; the curves are
        ##  right on top of each other, but the binomial parameters are absurd, like N ~ 10^4.)
        ##
        ## So we have 8 candidates: lnorm, gamma, negbinomial, weibull, poisson
        ## to which we've added chisq, f, hyper
        ##
        ## Pick distribution(s) fitted with lowest BIC

        ## *** get hyper to converge by choosing "sensible" initial values
        ## Just thrashing here, attempting to guess parameter values:
        ##
        ## Must have k in 0..(m+n).
        ##
        ## Range of x is 0..k.  So k = max(postData[,"PostHits"]).
        ##
        ## p = m / (m + n)
        ## E[x]   = k p
        ## Var[x] = k p (1 - p) * (m+n-k)/(m+n-1)
        ##
        ## p <- 1/2                                # Arbitrary; implies m = n
        ## k <- mean(postData[, "PostHits"]) / p   # Gets mean right
        ##
        ## fithyper <<- fitdist(data = postData[, "PostHits"], distr = "hyper", method = "mle", start = list("m" = k/2, "n" = k/2, "k" = 2 * k))

        fitlnorm <<- fitdist(data = postData[, colName] + 1, distr = "lnorm", method = "mle")
        cat(sprintf("\n- Lognormal:\n")); print(fitlnorm)
        lines(x = vals, y = dlnorm(vals,               #
                                   meanlog = coef(fitlnorm)[["meanlog"]],
                                   sdlog   = coef(fitlnorm)[["sdlog"]]),
              lty = "solid", lwd = 2, col = "red")     #

        legend("topright", bg = "antiquewhite", inset = 0.01,
               pch    = c(22,      NA),
               pt.bg  = c("blue",  NA),
               pt.cex = c(2,       NA),
               lty    = c(NA,      "solid"),
               lwd    = c(NA,      2),
               col    = c("black", "red"),
               ## *** Put logmean and sdmean (with sd's) in legend too, or in a separate legend
               legend = c("Observations", sprintf("Lognormal BIC = %7.1f", fitlnorm$"bic")))

#        fitgamma <<- fitdist(data = postData[, colName], distr = "gamma", method = "mle")
#        cat(sprintf("\n- Gamma:\n")); print(coef(fitgamma))
#        lines(x = vals, y = dgamma(vals,               #
#                                   shape = coef(fitgamma)[["shape"]],
#                                   rate  = coef(fitgamma)[["rate"]]),
#              lty = "solid", lwd = 2, col = "green")   #

#        fitnbinom <<- fitdist(data = postData[, colName], distr = "nbinom", method = "mle")
#        cat(sprintf("\n- Negbinomial:\n")); print(coef(fitnbinom))
#        lines(x = vals, y = dnbinom(vals,              #
#                                    size = coef(fitnbinom)[["size"]],
#                                    mu   = coef(fitnbinom)[["mu"]]),
#              lty = "solid", lwd = 2, col = "black")   #

#        fitweibull <<- fitdist(data = postData[, colName], distr = "weibull", method = "mle")
#        cat(sprintf("\n- Weibull:\n")); print(coef(fitweibull))
#        lines(x = vals, y = dweibull(vals,             #
#                                     shape = coef(fitweibull)[["shape"]],
#                                     scale = coef(fitweibull)[["scale"]]),
#              lty = "solid", lwd = 2, col = "gray")    #

#        fitf     <<- fitdist(data = postData[, colName], distr = "f",     method = "mle",
#                             ## Again, a random stab in the dark at the number of degress of freedom
#                             ##  both being the number of posts.  It converges to df1 being a bit more,
#                             ##  and pushes df2 down to between 0 and 1.
#                             ## However, if you cut the initial guesses, the outcome changes wildly.
#                             ##  So it's much more sensitive to my weird guesses at initial conditions
#                             ##  than chisquare!
#                             start = list("df1" = nrow(postData), "df2" = nrow(postData)))
#        cat(sprintf("\n- F:\n")); print(coef(fitf))
#        lines(x = vals, y = df(vals, df1 = coef(fitf)[["df1"]], df2 = coef(fitf)[["df2"]]),
#              lty = "solid", lwd = 2, col = "yellow")    #

#       fitchisq <<- fitdist(data = postData[, colName], distr = "chisq", method = "mle",
#                             ## Just took a guess @ number of degrees of freedom = number of posts.
#                             ##   It ends up fitting to a lot less than that.  Just for robustness
#                             ##   testing, tried HALF the number of posts; converged to same answer.
#                             ## Ignoring the non-centrality parameter; should figre it out!  Took a
#                             ##   couple guesses, e.g., mean number of hits.  Never converged.
#                             ## *** NB: special case of Gamma; why doesn't it do as well?
#                             start = list("df" = nrow(postData)))
#        cat(sprintf("\n- Chi squared:\n")); print(coef(fitchisq))
#        lines(x = vals, y = dchisq(vals, df = coef(fitchisq)[["df"]]),
#              lty = "solid", lwd = 2, col = "blue")    #

#        fitpois <<- fitdist(data = postData[, colName], distr = "pois", method = "mle")
#        cat(sprintf("\n- Poisson:\n")); print(coef(fitpois))
#        lines(x = vals, y = dpois(vals, lambda  = coef(fitpois)[["lambda"]]),
#              lty = "solid", lwd = 2, col = "orange")  #

#        legend("topright", bg = "antiquewhite", inset = 0.01,
#               pch    = c(22,      NA,      NA,      NA,      NA,      NA,       NA, NA),
#               pt.bg  = c("blue",  NA,      NA,      NA,      NA,      NA,       NA, NA),
#               pt.cex = c(2,       NA,      NA,      NA,      NA,      NA,       NA, NA),
#               lty    = c(NA,      "solid", "solid", "solid", "solid", "solid",  "solid", "solid"),
#               lwd    = c(NA,      2,       2,       2,       2,       2,        2, 2),
#               col    = c("black", "red",   "green", "black",  "gray", "yellow", "blue", "orange"),
#               legend = c("Observations",              #
#                          sprintf("Lognormal    BIC =  %7.1f",        fitlnorm$"bic"),
#                          sprintf("Gamma        BIC =  %7.1f",        fitgamma$"bic"),
#                          sprintf("Negbinomial BIC =  %7.1f",         fitnbinom$"bic"),
#                          sprintf("Weibull         BIC =  %7.1f",     fitweibull$"bic"),
#                          sprintf("F                  BIC  =  %7.1f", fitf$"bic"),
#                          sprintf("Chi squared BIC =   %7.1f",        fitchisq$"bic"),
#                          sprintf("Poisson        BIC = %7.1f",       fitpois$"bic")))
#
#       bics <- data.frame(Distribution = c("Lognormal", "Gamma", "Weibull", "Negbinomial", "Poisson",
#                                            "Chi Squared", "F"),
#                           BIC          = c(fitlnorm$"bic", fitgamma$"bic", fitweibull$"bic",
#                                            fitnbinom$"bic", fitpois$"bic", fitchisq$"bic",
#                                            fitf$"bic"))
#        bics <- bics[order(bics$"BIC"), ]              # Sort so best is at top of table
#        cat(sprintf("\n* Bayes information criteria for distribution of %s:\n", colName))
#        print(bics)                                    # Show table to transcript

        cat("\n")                                      #

      }                                                #
    }                                                  #

    f <- if (is.null(plotFile)) NULL else file.path(destDir, plotFile)
    withPNG(f, plotWidth, plotHeight, FALSE, function() {
      withPars(function() {                            # Save/restore graphics parameters

        scatterplotWithLOESS(postData, "PostHits",     clGray, "y", FALSE, 20)
        scatterplotWithLOESS(postData, "PostComments", clGray, "",  TRUE,  NULL, barPlot = TRUE)

        title(main = sprintf("%s Hits & Comments %s: %d posts",
                             blogName,                 # Overall title of all plots
                             format(postData[1, "HitsEnd"], format = "%Y-%b-%d"),
                             nrow(postData)),          # Include number of posts
              outer = TRUE, cex.main = 1.5)            # It's in the top outer margin

      }, pty   = "m",                                  # Maximal plotting area
         bg    = "white",                              # Background
         ps    = 16,                                   # Larger type size for file capture
         mfrow = c(2, 2),                              # 2 x 2 array of plots
         oma   = c(0, 0, 1, 0),                        # Outer margin @ top for overall title
         mar   = c(3, 3, 2, 1),                        # Pull in on margins
         mgp   = c(1.7, 0.5, 0))                       # Axis title, label, tick
    })                                                 # Done with file capture
    cat(sprintf("* Data vs time plot: %s.", f))        # Capture filename to transcript

    qs  <- quantile(postData$"PostHits", probs = seq(from = 0.00, to = 1.00, by = 0.10))
    foo <- transform(postData,                         # Original data with hit quantile column
                     PostHitsDecile = cut(postData$"PostHits",
                                          breaks         = qs,
                                          labels         = names(qs)[-11],
                                          include.lowest = TRUE,
                                          ordered_result = TRUE))
    tbl <- table(foo$"PostComments", foo$"PostHitsDecile")
    cat(sprintf("\n  - Table of number of comments/post(rows) vs number of hits/post (cols):\n"))
    print(tbl)                                         # Table of comments x hits, for bicluster

    f2  <- if (is.null(f)) NULL else sub("^(.*)-1\\.png$", "\\1-2.png", f)
    withPNG(f2, plotWidth / 2, plotHeight / 2, FALSE, function() {
      legendFrac <- 0.20                               # How much space on left for color legend
      withPars(function() {                            # Save/restore graphics parameters
        ctRange <- range(tbl)                          # Range of counts in table
        nColors <- min(diff(ctRange) + 1, 256)         # Colors used in bicluster
        colors  <- rev(makeFDRColors(nColors, maxFDRColored = 1.0, baseColor = "blue"))

        heatmapRespectFALSE(tbl, scale = "none", col = colors,
                            ## 2ndary sort key by comment # on rows and by decile on cols
                            Rowv = -as.integer(rownames(tbl)),
                            Colv = -as.integer(gsub("^([0-9]*)%$", "\\1", colnames(tbl))),
                            margins = c(1.5 * max(nchar(colnames(tbl))),
                                        2.5 * max(nchar(rownames(tbl)))),
                            main = "Comment/Hit Bicluster", xlab = "Hit Decile", ylab = "Comments",
                            add.expr = {box(which = "plot")})

        par(omd = c(0, legendFrac, 0, 1), pty = "m", mar = c(1, 3, 1, 1), new = TRUE)
        image(x    = 0,                                # draw the color bar
              y    = seq(from = ctRange[[1]], to = ctRange[[2]], along.with = colors),
              z    = matrix(1 : length(colors), ncol = length(colors)),
              cex.axis = 0.9,                          # Smaller labels
              ylim = ctRange,                          # Show range of POSSIBLE correlations
              col  = colors,                           # Heatmap colors
              xaxt = "n", ylab = "Number of Posts", xlab = NA)
        box(which = "plot")                            # Box around the color legend

      }, pty = "m",                                    # Maximal plotting area
         bg  = "white",                                # White background
         omd = c(legendFrac, 1, 0, 0.99),              # Leave room for color bar legend
         ps  = 16,                                     # Larger type for file capture
         mar = c(1, 1, 0.9, 1),                        # Margins: title inexplicably cropped @ top?
         mgp = c(1.7, 0.5, 0))                         # Axis title, label, tick
    })                                                 #
    cat(sprintf("\n\n* Bicluster: %s\n", f2))          # Capture dest file to transcript

    cat(sprintf("\n\n* Correlation test between hits and comments:\n"))
    print(cor.test(postData$"PostHits", postData$"PostComments", method = "pearson"))
    ## Spearman exact = FALSE to avoid warning: "Cannot compute exact p-value with ties"
    print(cor.test(postData$"PostHits", postData$"PostComments", method = "spearman", exact = FALSE))

    f3 <- if (is.null(f)) NULL else sub("^(.*)\\-1.png$", "\\1-3.png", f)
    withPNG(f3, plotWidth / 2, plotHeight / 2, FALSE, function() {
      ys <- postData$"PostComments"                    # Regression of comments on hits in
      withPars(function() {                            #  scatterplot of comments vs hits
        scatterplotWithDensities(xs = postData$"PostHits",
                                 ys = ys,              #
                                 xlab = "PostHits (log scale)", ylab = "PostComments",
                                 main = "Hits vs Comments",
                                 bg = "blue", log = "x", nLevels = 20, kdeN = 100,
                                 ## Too few comment levels, so give comment bandwidth manually;
                                 ## heuristic derived from bandwidth.nrd()
                                 kdeH = c(bandwidth.nrd(postData$"PostHits"),
                                          4 * 1.06 * sqrt(var(ys)) * length(ys)^(-1/5)),
                                 regressionColor = NULL) ## Figure out how to handle log scale

        mdl    <- lm(PostComments ~ log(PostHits), data = postData)
        mdlSum <- summary(mdl)                         # Extract details of regression results
        cat("\n\n"); print(mdlSum)                     # Show regression stats
        ## *** Figure out untf arg, make scatterPlotWithDensities() do this right
        ## > abline(reg = mdl, lty = "dashed", col = "red", lwd = 2, untf = TRUE)
        ## > abline(reg = mdl, lty = "dashed", col = "red", lwd = 2, untf = FALSE)
        foo <- data.frame(x = postData$"PostHits", y = predict(mdl))
        foo <- foo[order(foo$"x"), ]                   # Plot log-linear fit
        lines(x = foo$"x", y = foo$"y", lty = "dashed", lwd = 2, col = "red")
        coefData <- coef(mdlSum)                       # Extract coeffs & p-values from model
        legend("topleft", bg = "antiquewhite", inset = 0.01, title = "Regression", cex = 0.8,
               legend = c(sprintf("Slope = %.2f (t-stat p ~ %.1e)",
                                  coefData[["log(PostHits)", "Estimate"]],
                                  coefData[["log(PostHits)", "Pr(>|t|)"]]),
                          sprintf("F-stat p ~ %.1e, Adj R^2 ~ %.1f%%",
                                  pf(mdlSum$"fstatistic"[[1]],
                                     mdlSum$"fstatistic"[[2]],
                                     mdlSum$"fstatistic"[[3]],
                                     lower.tail = FALSE),
                                  100.0 * mdlSum$"adj.r.squared")))

      }, pty = "m",                                    #
         bg  = "white",                                #
         ps  = 16,                                     #
         mar = c(3, 3, 2, 1),                          #
         mgp = c(1.7, 0.5, 0))                         #
    })                                                 #
    cat(sprintf("\n\n* Scatterplot: %s.\n", f3))       # Capture filename to transcript

    TRUE                                               # Flag that it was done
  }                                                    #

  ## *** Add capability for summary by quarters instead of years
  ## *** Add number of comments not from me for each year
  ## *** Add number of comments from me for each year
  ## *** Total unique commenters for each year (and hand-collapse known spelling variations,
  ##     compare with unique MD5 hashes of emails)
  summarizePostFrequency <- function(postData, commentsDir, commentPatt) {
    ## Returns a dataframe summarizing post frequency by year and overall, e.g.:
    ##    Year NPosts NDays NComments DaysPerPost DaysPerComment CommentsPerPost
    ## 1  2020     41   184        21        4.49           8.76            0.51
    ## 2  2021    111   365        58        3.29           6.29            0.52
    ## 3  2022     22    51        17        2.32           3.00            0.77
    ## 4 Total    174   600        96        3.45           6.25            0.55

    commentYears <- function(commentsDir, commentPatt) {

      unixTimestampYear <- function(ts) { dateYear(as.Date(as.POSIXct(ts, origin = "1970-01-01"))) }

      ## NB: Timestamps are 13-digit integers, apparently msec since Unix epoch 1970-01-01.
      ##     (It would be more traditional to do it in seconds, and have a 10-digit integer,
      ##     which would fit into a 32-bit word, but we have some non-traditionalists here.)
      ##
      ##     So we convert to double-float via as.numeric(), divide by 1000msec/sec to get the
      ##     seconds elapsed, then convert THAT to integer via as.integer().  Then convert THAT
      ##     to a Date and extract the year.  Phew!
      sapply(list.files(path = commentsDir, patt = commentPatt, recursive = TRUE), function(cf) {
        unixTimestampYear(as.integer(as.numeric(sub(commentPatt, "\\1", cf)) / 1000))
      })                                               #
    }                                                  #

    blogDays <- function(yr, blogStartYr = 2020) {     # How many blogging days in a given year?

      isLeapYear <- function(yr) {                     # Is this year a leap year?
        ## Gregorian calendar leap year iff:
        ## - divisible by 4 and NOT by 100, or
        ## - divisible by 4 AND by 100 AND by 400
        yr <- as.integer(yr)                           # Defensive programming: convert to integer
        stopifnot(is.integer(yr) && yr >= 1583)        # First full year of Gregorian calendar
        (yr %% 4 == 0) && (!(yr %% 100 == 0) || (yr %% 400 == 0))
      }                                                # Returns TRUE if yr is a leap year

      currDate <- Sys.Date()                           # Current date
      currYr   <- as.integer(format(currDate, "%Y"))   # Current year and
      stopifnot(blogStartYr <= yr && yr <= currYr)     # Signal if year is too far back, or future

      if (yr == blogStartYr)                           # If first year of blogging: Jul 01 to Dec 31
        184                                            #  was 184 days
      else if (yr < currYr) {                          # Elseif between then & now, days in year:
        if (isLeapYear(yr)) 366 else 365               #  take account of leap year
      } else                                           # Else it's the current year
        as.integer(strftime(currDate, "%j"))           #  so return number of days so far
    }                                                  #

    yearComments <- table(commentYears(commentsDir, commentPatt))
    yearComments <- data.frame(Year      = as.integer(dimnames(yearComments)[[1]]),
                               NComments = as.vector(yearComments))
    foo <- transform(transform(merge(ddply(transform(postData,
                                                     Year = as.integer(
                                                              sprintf("%s", dateYear(PostDate)))),
                                           "Year",     # Map over year subsets of the data
                                           function(ydf) {
                                             data.frame(NPosts = nrow(ydf),
                                                        NDays  = blogDays(ydf[[1, "Year"]]))
                                           }),         #
                                     yearComments, by = "Year", all = TRUE),
                               NComments = ifelse(is.na(NComments), 0, NComments)),
                     DaysPerPost     = round(NDays     / NPosts,    digits = 2),
                     DaysPerComment  = round(NDays     / NComments, digits = 2),
                     CommentsPerPost = round(NComments / NPosts,    digits = 2))
    rbind(foo,                                         # Add another row at the bottom, which has
          transform(data.frame(Year      = "Total",    #  totals of posts & days, & overall days/post
                               NPosts    = sum(foo$"NPosts"),
                               NDays     = sum(foo$"NDays"),
                               NComments = sum(foo$"NComments")),
                    DaysPerPost     = round(NDays / NPosts,     digits = 2),
                    DaysPerComment  = round(NDays / NComments,  digits = 2),
                    CommentsPerPost = round(NComments / NPosts, digits = 2)))
  }                                                    #

  withTranscript(postsDir, destDir, txFile, "Blog Post Hit Counts", function() {

    if (!is.null(clearVars)) {                         # Wants completely virgin calculation?
      heraldPhase("Clearing out previous results")     # Delete globals holding previous results
      removePreviousResults(clearVars)                 # Remove the requested globals
    }                                                  #

    heraldPhase("Getting hit count for each post")     # Announce what we're doing
    maybeAssign("postData", function() {               # Collect hit count for each post
      getPostData(postStartDate, hitStartDate, today, postsDir, postPatt, jsonRegexp, countURL,
                  commentsDir, commentPatt, year)      #
    })                                                 # Done retrieving counts

    heraldPhase("Saving results")                      # Save the results, maybe
    maybeAssign("postDataSaved", function() { savePostData(destDir, dataFile, postData) })

    heraldPhase("Plotting hits vs time")               # Plot hits vs time and probability
    maybeAssign("plotDone", function() {               #   distribution of hits
      plotDataVsTime(postData, blogName, clGray, plotWidth, plotHeight, destDir, plotFile)
    })                                                 #

    heraldPhase("Summarizing Post Data")               # Summarize post/comment/hit counts
    maybeAssign("summaryDone", function() {            #  in a table of stats by year
      print(summarizePostFrequency(postData, commentsDir, commentPatt))
      TRUE                                             # Flag that it was done
    })                                                 #

    invisible(postData)                                # Return results invisibly (also in global var)

  })                                                   # End withTranscript()
}                                                      #