certmagic/maintain.go

// Copyright 2015 Matthew Holt
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package certmagic
import (
"context"
"crypto/x509"
"encoding/json"
"encoding/pem"
"errors"
"fmt"
"io/fs"
"path"
"runtime"
"strings"
"time"
"github.com/mholt/acmez/v2/acme"
"go.uber.org/zap"
"golang.org/x/crypto/ocsp"
)
// maintainAssets is a permanently-blocking function
// that loops indefinitely and, on a regular schedule, checks
// certificates for expiration and initiates a renewal of certs
// that are expiring soon. It also updates OCSP stapling. It
// should only be called once per cache. Panics are recovered,
// and if panicCount < 10, the function is called recursively,
// incrementing panicCount each time. Initial invocation should
// start panicCount at 0.
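//
// A minimal launch sketch (an assumption; the cache constructor that actually
// starts this goroutine lives elsewhere in the package):
//
//	go certCache.maintainAssets(0)
//
// The loop exits when certCache.stopChan is closed (see the select below).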
func (certCache *Cache) maintainAssets(panicCount int) {
log := certCache.logger.Named("maintenance")
log = log.With(zap.String("cache", fmt.Sprintf("%p", certCache)))
defer func() {
if err := recover(); err != nil {
buf := make([]byte, stackTraceBufferSize)
buf = buf[:runtime.Stack(buf, false)]
log.Error("panic", zap.Any("error", err), zap.ByteString("stack", buf))
if panicCount < 10 {
certCache.maintainAssets(panicCount + 1)
}
}
}()
certCache.optionsMu.RLock()
renewalTicker := time.NewTicker(certCache.options.RenewCheckInterval)
ocspTicker := time.NewTicker(certCache.options.OCSPCheckInterval)
certCache.optionsMu.RUnlock()
log.Info("started background certificate maintenance")
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
for {
select {
case <-renewalTicker.C:
err := certCache.RenewManagedCertificates(ctx)
if err != nil {
log.Error("renewing managed certificates", zap.Error(err))
}
case <-ocspTicker.C:
certCache.updateOCSPStaples(ctx)
case <-certCache.stopChan:
renewalTicker.Stop()
ocspTicker.Stop()
log.Info("stopped background certificate maintenance")
close(certCache.doneChan)
return
}
}
}
// RenewManagedCertificates renews managed certificates,
// including ones loaded on-demand. Note that this is done
// automatically on a regular basis; normally you will not
// need to call this. This method assumes non-interactive
// mode (i.e. operating in the background).
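//
// A minimal sketch of a manual invocation (hypothetical; certCache is a *Cache
// and logger is whatever *zap.Logger the caller already has; normally the
// maintenance goroutine above calls this on its own ticker):
//
//	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
//	defer cancel()
//	if err := certCache.RenewManagedCertificates(ctx); err != nil {
//	    logger.Error("renewal sweep failed", zap.Error(err))
//	}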
func (certCache *Cache) RenewManagedCertificates(ctx context.Context) error {
log := certCache.logger.Named("maintenance")
// configs will hold a map of certificate hash to the config
// to use when managing that certificate
configs := make(map[string]*Config)
// we use the queues for a very important reason: to do any and all
// operations that could require an exclusive write lock outside
// of the read lock! otherwise we get a deadlock, yikes. in other
// words, our first iteration through the certificate cache does NOT
// perform any operations--only queues them--so that more fine-grained
// write locks may be obtained during the actual operations.
var renewQueue, reloadQueue, deleteQueue, ariQueue certList
certCache.mu.RLock()
for certKey, cert := range certCache.cache {
if !cert.managed {
continue
}
// the list of names on this cert should never be empty... programmer error?
if len(cert.Names) == 0 {
log.Warn("certificate has no names; removing from cache", zap.String("cert_key", certKey))
deleteQueue = append(deleteQueue, cert)
continue
}
// get the config associated with this certificate
cfg, err := certCache.getConfig(cert)
if err != nil {
log.Error("unable to get configuration to manage certificate; unable to renew",
zap.Strings("identifiers", cert.Names),
zap.Error(err))
continue
}
if cfg == nil {
// this is bad if this happens, probably a programmer error (oops)
log.Error("no configuration associated with certificate; unable to manage",
zap.Strings("identifiers", cert.Names))
continue
}
if cfg.OnDemand != nil {
continue
}
// ACME-specific: see if the ACME Renewal Info (ARI) window needs refreshing
if !cfg.DisableARI && cert.ari.NeedsRefresh() {
configs[cert.hash] = cfg
ariQueue = append(ariQueue, cert)
}
// if time is up or expires soon, we need to try to renew it
if cert.NeedsRenewal(cfg) {
configs[cert.hash] = cfg
// see if the certificate in storage has already been renewed, possibly by another
// instance that didn't coordinate with this one; if so, just load it from storage
// (checking storage first is a simple way to drastically reduce rate limit problems)
storedCertNeedsRenew, err := cfg.managedCertInStorageNeedsRenewal(ctx, cert)
if err != nil {
// hmm, weird, but not a big deal, maybe it was deleted or something
log.Warn("error while checking if stored certificate is also expiring soon",
zap.Strings("identifiers", cert.Names),
zap.Error(err))
} else if !storedCertNeedsRenew {
// if the certificate does NOT need renewal and there was no error, then we
// are good to just reload the certificate from storage instead of repeating
// a likely-unnecessary renewal procedure
reloadQueue = append(reloadQueue, cert)
continue
}
// the certificate in storage has not been renewed yet, so we will do it
// NOTE: It is super-important to note that the TLS-ALPN challenge requires
// a write lock on the cache in order to complete its challenge, so it is extra
// vital that this renew operation does not happen inside our read lock!
renewQueue.insert(cert)
}
}
certCache.mu.RUnlock()
// Update ARI, and then for any certs where the ARI window changed,
// be sure to queue them for renewal if necessary
for _, cert := range ariQueue {
cfg := configs[cert.hash]
cert, changed, err := cfg.updateARI(ctx, cert, log)
if err != nil {
log.Error("updating ARI", zap.Error(err))
}
if changed && cert.NeedsRenewal(cfg) {
// it's theoretically possible that another instance already got the memo
// on the changed ARI and even renewed the cert already, and thus doing it
// here is wasteful, but I have never heard of this happening in reality,
// so to save some cycles for now I think we'll just queue it for renewal
// (notice how we use 'insert' to avoid duplicates, in case it was already
// scheduled for renewal anyway)
renewQueue.insert(cert)
}
}
// Reload certificates that merely need to be updated in memory
for _, oldCert := range reloadQueue {
timeLeft := expiresAt(oldCert.Leaf).Sub(time.Now().UTC())
log.Info("certificate expires soon, but is already renewed in storage; reloading stored certificate",
zap.Strings("identifiers", oldCert.Names),
zap.Duration("remaining", timeLeft))
cfg := configs[oldCert.hash]
// crucially, this happens OUTSIDE a lock on the certCache
_, err := cfg.reloadManagedCertificate(ctx, oldCert)
if err != nil {
log.Error("loading renewed certificate",
zap.Strings("identifiers", oldCert.Names),
zap.Error(err))
continue
}
}
// Renewal queue
for _, oldCert := range renewQueue {
cfg := configs[oldCert.hash]
err := certCache.queueRenewalTask(ctx, oldCert, cfg)
if err != nil {
log.Error("queueing renewal task",
zap.Strings("identifiers", oldCert.Names),
zap.Error(err))
continue
}
}
// Deletion queue
certCache.mu.Lock()
for _, cert := range deleteQueue {
certCache.removeCertificate(cert)
}
certCache.mu.Unlock()
return nil
}
func (certCache *Cache) queueRenewalTask(ctx context.Context, oldCert Certificate, cfg *Config) error {
log := certCache.logger.Named("maintenance")
timeLeft := expiresAt(oldCert.Leaf).Sub(time.Now().UTC())
log.Info("certificate expires soon; queuing for renewal",
zap.Strings("identifiers", oldCert.Names),
zap.Duration("remaining", timeLeft))
// Get the name which we should use to renew this certificate;
// we only support managing certificates with one name per cert,
// so this should be easy.
renewName := oldCert.Names[0]
// queue up this renewal job (is a no-op if already active or queued)
jm.Submit(cfg.Logger, "renew_"+renewName, func() error {
timeLeft := expiresAt(oldCert.Leaf).Sub(time.Now().UTC())
log.Info("attempting certificate renewal",
zap.Strings("identifiers", oldCert.Names),
zap.Duration("remaining", timeLeft))
// perform renewal - crucially, this happens OUTSIDE a lock on certCache
err := cfg.RenewCertAsync(ctx, renewName, false)
if err != nil {
if cfg.OnDemand != nil {
// loaded dynamically, remove dynamically
certCache.mu.Lock()
certCache.removeCertificate(oldCert)
certCache.mu.Unlock()
}
return fmt.Errorf("%v %v", oldCert.Names, err)
}
// successful renewal, so update in-memory cache by loading
// renewed certificate so it will be used with handshakes
_, err = cfg.reloadManagedCertificate(ctx, oldCert)
if err != nil {
return ErrNoRetry{fmt.Errorf("%v %v", oldCert.Names, err)}
}
return nil
})
return nil
}
// updateOCSPStaples updates the OCSP stapling in all
// eligible, cached certificates.
//
// OCSP maintenance strives to abide by the relevant points of
// Ryan Sleevi's recommendations for good OCSP support:
// https://gist.github.com/sleevi/5efe9ef98961ecfb4da8
func (certCache *Cache) updateOCSPStaples(ctx context.Context) {
logger := certCache.logger.Named("maintenance")
// temporary structures to store updates or tasks
// so that we can keep our locks short-lived
type ocspUpdate struct {
rawBytes []byte
parsed *ocsp.Response
}
type updateQueueEntry struct {
cert Certificate
certHash string
lastNextUpdate time.Time
cfg *Config
}
type renewQueueEntry struct {
oldCert Certificate
cfg *Config
}
updated := make(map[string]ocspUpdate)
var updateQueue []updateQueueEntry // certs that need a refreshed staple
var renewQueue []renewQueueEntry // certs that need to be renewed (due to revocation)
// obtain brief read lock during our scan to see which staples need updating
certCache.mu.RLock()
for certHash, cert := range certCache.cache {
// no point in updating OCSP for expired or "synthetic" certificates
if cert.Leaf == nil || cert.Expired() {
continue
}
cfg, err := certCache.getConfig(cert)
if err != nil {
logger.Error("unable to get automation config for certificate; maintenance for this certificate will likely fail",
zap.Strings("identifiers", cert.Names),
zap.Error(err))
continue
}
// always try to replace revoked certificates, even if OCSP response is still fresh
if certShouldBeForceRenewed(cert) {
renewQueue = append(renewQueue, renewQueueEntry{
oldCert: cert,
cfg: cfg,
})
continue
}
// if the status is not fresh, get a new one
var lastNextUpdate time.Time
if cert.ocsp != nil {
lastNextUpdate = cert.ocsp.NextUpdate
if cert.ocsp.Status != ocsp.Unknown && freshOCSP(cert.ocsp) {
// no need to update our staple if still fresh and not Unknown
continue
}
}
updateQueue = append(updateQueue, updateQueueEntry{cert, certHash, lastNextUpdate, cfg})
}
certCache.mu.RUnlock()
// perform updates outside of any lock on certCache
for _, qe := range updateQueue {
cert := qe.cert
certHash := qe.certHash
lastNextUpdate := qe.lastNextUpdate
if qe.cfg == nil {
// this is bad if this happens, probably a programmer error (oops)
logger.Error("no configuration associated with certificate; unable to manage OCSP staples",
zap.Strings("identifiers", cert.Names))
continue
}
err := stapleOCSP(ctx, qe.cfg.OCSP, qe.cfg.Storage, &cert, nil)
if err != nil {
if cert.ocsp != nil {
// if there was no staple before, that's fine; otherwise we should log the error
logger.Error("stapling OCSP",
zap.Strings("identifiers", cert.Names),
zap.Error(err))
}
continue
}
// By this point, we've obtained the latest OCSP response.
// If there was no staple before, or if the response is updated, make
// sure we apply the update to all names on the certificate if
// the status is still Good.
if cert.ocsp != nil && cert.ocsp.Status == ocsp.Good && (lastNextUpdate.IsZero() || lastNextUpdate != cert.ocsp.NextUpdate) {
logger.Info("advancing OCSP staple",
zap.Strings("identifiers", cert.Names),
zap.Time("from", lastNextUpdate),
zap.Time("to", cert.ocsp.NextUpdate))
updated[certHash] = ocspUpdate{rawBytes: cert.Certificate.OCSPStaple, parsed: cert.ocsp}
}
// If the updated staple shows that the certificate was revoked, we should immediately renew it
if certShouldBeForceRenewed(cert) {
qe.cfg.emit(ctx, "cert_ocsp_revoked", map[string]any{
"subjects": cert.Names,
"certificate": cert,
"reason": cert.ocsp.RevocationReason,
"revoked_at": cert.ocsp.RevokedAt,
})
renewQueue = append(renewQueue, renewQueueEntry{
oldCert: cert,
cfg: qe.cfg,
})
}
}
// These write locks should be brief since we have all the info we need now.
for certKey, update := range updated {
certCache.mu.Lock()
if cert, ok := certCache.cache[certKey]; ok {
cert.ocsp = update.parsed
cert.Certificate.OCSPStaple = update.rawBytes
certCache.cache[certKey] = cert
}
certCache.mu.Unlock()
}
// We attempt to replace any certificates that were revoked.
// Crucially, this happens OUTSIDE a lock on the certCache.
for _, renew := range renewQueue {
_, err := renew.cfg.forceRenew(ctx, logger, renew.oldCert)
if err != nil {
logger.Info("forcefully renewing certificate due to REVOKED status",
zap.Strings("identifiers", renew.oldCert.Names),
zap.Error(err))
}
}
}
// storageHasNewerARI returns true if the configured storage has ARI that is newer
// than that of a certificate that is already loaded, along with the value from
// storage.
func (cfg *Config) storageHasNewerARI(ctx context.Context, cert Certificate) (bool, acme.RenewalInfo, error) {
storedCertData, err := cfg.loadStoredACMECertificateMetadata(ctx, cert)
if err != nil || storedCertData.RenewalInfo == nil {
return false, acme.RenewalInfo{}, err
}
// prefer stored info if it has a window and the loaded one doesn't,
// or if the one in storage has a later RetryAfter (though I suppose
// it's not guaranteed, typically those will move forward in time)
if (!cert.ari.HasWindow() && storedCertData.RenewalInfo.HasWindow()) ||
(cert.ari.RetryAfter == nil || storedCertData.RenewalInfo.RetryAfter.After(*cert.ari.RetryAfter)) {
return true, *storedCertData.RenewalInfo, nil
}
return false, acme.RenewalInfo{}, nil
}
// loadStoredACMECertificateMetadata loads the stored ACME certificate data
// from the cert's sidecar JSON file.
func (cfg *Config) loadStoredACMECertificateMetadata(ctx context.Context, cert Certificate) (acme.Certificate, error) {
metaBytes, err := cfg.Storage.Load(ctx, StorageKeys.SiteMeta(cert.issuerKey, cert.Names[0]))
if err != nil {
return acme.Certificate{}, fmt.Errorf("loading cert metadata: %w", err)
}
var certRes CertificateResource
if err = json.Unmarshal(metaBytes, &certRes); err != nil {
return acme.Certificate{}, fmt.Errorf("unmarshaling cert metadata: %w", err)
}
var acmeCert acme.Certificate
if err = json.Unmarshal(certRes.IssuerData, &acmeCert); err != nil {
return acme.Certificate{}, fmt.Errorf("unmarshaling potential ACME issuer metadata: %v", err)
}
return acmeCert, nil
}
// updateARI updates the cert's ACME renewal info, first by checking storage for a newer
// one, or getting it from the CA if needed. The updated info is stored in storage and
// updated in the cache. The certificate with the updated ARI is returned. If true is
// returned, the ARI window or selected time has changed, and the caller should check if
// the cert needs to be renewed now, even if there is an error.
//
// This will always try to update ARI without first checking whether it needs to be
// refreshed. Call NeedsRefresh() on the RenewalInfo first, and only call this if that returns true.
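//
// The expected call pattern, mirroring RenewManagedCertificates above
// (a sketch only):
//
//	if !cfg.DisableARI && cert.ari.NeedsRefresh() {
//	    cert, changed, err := cfg.updateARI(ctx, cert, logger)
//	    if err != nil {
//	        logger.Error("updating ARI", zap.Error(err))
//	    }
//	    if changed && cert.NeedsRenewal(cfg) {
//	        // queue the certificate for renewal
//	    }
//	}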
func (cfg *Config) updateARI(ctx context.Context, cert Certificate, logger *zap.Logger) (updatedCert Certificate, changed bool, err error) {
logger = logger.With(
zap.Strings("identifiers", cert.Names),
zap.String("cert_hash", cert.hash),
zap.String("ari_unique_id", cert.ari.UniqueIdentifier),
zap.Time("cert_expiry", cert.Leaf.NotAfter))
updatedCert = cert
oldARI := cert.ari
// synchronize ARI fetching; see #297
lockName := "ari_" + cert.ari.UniqueIdentifier
if err := acquireLock(ctx, cfg.Storage, lockName); err != nil {
return cert, false, fmt.Errorf("unable to obtain ARI lock: %v", err)
}
defer func() {
if err := releaseLock(ctx, cfg.Storage, lockName); err != nil {
logger.Error("unable to release ARI lock", zap.Error(err))
}
}()
// see if the stored value has been refreshed already by another instance
gotNewARI, newARI, err := cfg.storageHasNewerARI(ctx, cert)
// when we're all done, log if something about the schedule is different
// ("WARN" level because ARI window changing may be a sign of external trouble
// and we want to draw the operator's attention to a potential explanation URL)
defer func() {
changed = !newARI.SameWindow(oldARI)
if changed {
logger.Warn("ARI window or selected renewal time changed",
zap.Time("prev_start", oldARI.SuggestedWindow.Start),
zap.Time("next_start", newARI.SuggestedWindow.Start),
zap.Time("prev_end", oldARI.SuggestedWindow.End),
zap.Time("next_end", newARI.SuggestedWindow.End),
zap.Time("prev_selected_time", oldARI.SelectedTime),
zap.Time("next_selected_time", newARI.SelectedTime),
zap.String("explanation_url", newARI.ExplanationURL))
}
}()
if err == nil && gotNewARI {
// great, storage has a newer one we can use
cfg.certCache.mu.Lock()
updatedCert = cfg.certCache.cache[cert.hash]
updatedCert.ari = newARI
cfg.certCache.cache[cert.hash] = updatedCert
cfg.certCache.mu.Unlock()
logger.Info("reloaded ARI with newer one in storage",
zap.Timep("next_refresh", newARI.RetryAfter),
zap.Time("renewal_time", newARI.SelectedTime))
return
}
if err != nil {
logger.Error("error while checking storage for updated ARI; updating ARI now", zap.Error(err))
}
// of the issuers configured, hopefully one of them is the ACME CA we got the cert from
for _, iss := range cfg.Issuers {
if ariGetter, ok := iss.(RenewalInfoGetter); ok {
newARI, err = ariGetter.GetRenewalInfo(ctx, cert) // be sure to use existing newARI variable so we can compare against old value in the defer
if err != nil {
// could be anything, but a common error might simply be the "wrong" ACME CA
// (meaning, different from the one that issued the cert, thus the only one
// that would have any ARI for it) if multiple ACME CAs are configured
logger.Error("failed updating renewal info from ACME CA",
zap.String("issuer", iss.IssuerKey()),
zap.Error(err))
continue
}
// when we get the latest ARI, the acme package will select a time within the window
// for us; of course, since it's random, it's likely different from the previously-
// selected time; but if the window doesn't change, there's no need to change the
// selected time (the acme package doesn't know the previous window to know better)
// ... so if the window hasn't changed we'll just put back the selected time
if newARI.SameWindow(oldARI) && !oldARI.SelectedTime.IsZero() {
newARI.SelectedTime = oldARI.SelectedTime
}
// then store the updated ARI (even if the window didn't change, the Retry-After
// likely did) in cache and storage
// be sure we get the cert from the cache while inside a lock to avoid logical races
cfg.certCache.mu.Lock()
updatedCert = cfg.certCache.cache[cert.hash]
updatedCert.ari = newARI
cfg.certCache.cache[cert.hash] = updatedCert
cfg.certCache.mu.Unlock()
// update the ARI value in storage
var certData acme.Certificate
certData, err = cfg.loadStoredACMECertificateMetadata(ctx, cert)
if err != nil {
err = fmt.Errorf("got new ARI from %s, but failed loading stored certificate metadata: %v", iss.IssuerKey(), err)
return
}
certData.RenewalInfo = &newARI
var certDataBytes, certResBytes []byte
certDataBytes, err = json.Marshal(certData)
if err != nil {
err = fmt.Errorf("got new ARI from %s, but failed marshaling certificate ACME metadata: %v", iss.IssuerKey(), err)
return
}
certResBytes, err = json.MarshalIndent(CertificateResource{
SANs: cert.Names,
IssuerData: certDataBytes,
}, "", "\t")
if err != nil {
err = fmt.Errorf("got new ARI from %s, but could not re-encode certificate metadata: %v", iss.IssuerKey(), err)
return
}
if err = cfg.Storage.Store(ctx, StorageKeys.SiteMeta(cert.issuerKey, cert.Names[0]), certResBytes); err != nil {
err = fmt.Errorf("got new ARI from %s, but could not store it with certificate metadata: %v", iss.IssuerKey(), err)
return
}
logger.Info("updated ACME renewal information",
zap.Time("selected_time", newARI.SelectedTime),
zap.Timep("next_update", newARI.RetryAfter),
zap.String("explanation_url", newARI.ExplanationURL))
return
}
}
err = fmt.Errorf("could not fully update ACME renewal info: either no issuer supporting ARI is configured for certificate, or all such failed (make sure the ACME CA that issued the certificate is configured)")
return
}
// CleanStorageOptions specifies how to clean up a storage unit.
type CleanStorageOptions struct {
// Optional custom logger.
Logger *zap.Logger
// Optional ID of the instance initiating the cleaning.
InstanceID string
// If set, cleaning will be skipped if it was performed
// more recently than this interval.
Interval time.Duration
// Whether to clean cached OCSP staples.
OCSPStaples bool
// Whether to clean up expired certificates, and if so,
// how long to let them stay after they've expired.
ExpiredCerts bool
ExpiredCertGracePeriod time.Duration
}
// CleanStorage removes assets which are no longer useful,
// according to opts.
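//
// Example usage (a sketch; the interval and grace period values are
// illustrative, not recommendations):
//
//	err := CleanStorage(ctx, storage, CleanStorageOptions{
//	    Interval:               24 * time.Hour,
//	    OCSPStaples:            true,
//	    ExpiredCerts:           true,
//	    ExpiredCertGracePeriod: 30 * 24 * time.Hour,
//	})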
func CleanStorage(ctx context.Context, storage Storage, opts CleanStorageOptions) error {
const (
lockName = "storage_clean"
storageKey = "last_clean.json"
)
if opts.Logger == nil {
opts.Logger = defaultLogger.Named("clean_storage")
}
opts.Logger = opts.Logger.With(zap.Any("storage", storage))
// storage cleaning should be globally exclusive
if err := acquireLock(ctx, storage, lockName); err != nil {
return fmt.Errorf("unable to acquire %s lock: %v", lockName, err)
}
defer func() {
if err := releaseLock(ctx, storage, lockName); err != nil {
opts.Logger.Error("unable to release lock", zap.Error(err))
return
}
}()
// cleaning should not happen more often than the interval
if opts.Interval > 0 {
lastCleanBytes, err := storage.Load(ctx, storageKey)
if !errors.Is(err, fs.ErrNotExist) {
if err != nil {
return fmt.Errorf("loading last clean timestamp: %v", err)
}
var lastClean lastCleanPayload
err = json.Unmarshal(lastCleanBytes, &lastClean)
if err != nil {
return fmt.Errorf("decoding last clean data: %v", err)
}
lastTLSClean := lastClean["tls"]
if time.Since(lastTLSClean.Timestamp) < opts.Interval {
nextTime := time.Now().Add(opts.Interval)
opts.Logger.Info("storage cleaning happened too recently; skipping for now",
zap.String("instance", lastTLSClean.InstanceID),
zap.Time("try_again", nextTime),
zap.Duration("try_again_in", time.Until(nextTime)),
)
return nil
}
}
}
opts.Logger.Info("cleaning storage unit")
if opts.OCSPStaples {
err := deleteOldOCSPStaples(ctx, storage, opts.Logger)
if err != nil {
opts.Logger.Error("deleting old OCSP staples", zap.Error(err))
}
}
if opts.ExpiredCerts {
err := deleteExpiredCerts(ctx, storage, opts.Logger, opts.ExpiredCertGracePeriod)
if err != nil {
opts.Logger.Error("deleting expired certificates staples", zap.Error(err))
}
}
// TODO: delete stale locks?
// update the last-clean time
lastCleanBytes, err := json.Marshal(lastCleanPayload{
"tls": lastCleaned{
Timestamp: time.Now(),
InstanceID: opts.InstanceID,
},
})
if err != nil {
return fmt.Errorf("encoding last cleaned info: %v", err)
}
if err := storage.Store(ctx, storageKey, lastCleanBytes); err != nil {
return fmt.Errorf("storing last clean info: %v", err)
}
return nil
}
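// lastCleanPayload maps a subsystem name (currently just "tls") to a record of
// its most recent cleaning. For reference, the JSON persisted at the
// "last_clean.json" storage key looks roughly like this (values illustrative):
//
//	{
//	    "tls": {
//	        "timestamp": "2024-01-02T03:04:05Z",
//	        "instance_id": "some-instance-id"
//	    }
//	}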
type lastCleanPayload map[string]lastCleaned
type lastCleaned struct {
Timestamp time.Time `json:"timestamp"`
InstanceID string `json:"instance_id,omitempty"`
}
func deleteOldOCSPStaples(ctx context.Context, storage Storage, logger *zap.Logger) error {
ocspKeys, err := storage.List(ctx, prefixOCSP, false)
if err != nil {
// maybe just hasn't been created yet; no big deal
return nil
}
for _, key := range ocspKeys {
// if context was cancelled, quit early; otherwise proceed
select {
case <-ctx.Done():
return ctx.Err()
default:
}
ocspBytes, err := storage.Load(ctx, key)
if err != nil {
logger.Error("while deleting old OCSP staples, unable to load staple file", zap.Error(err))
continue
}
resp, err := ocsp.ParseResponse(ocspBytes, nil)
if err != nil {
// contents are invalid; delete it
err = storage.Delete(ctx, key)
if err != nil {
logger.Error("purging corrupt staple file", zap.String("storage_key", key), zap.Error(err))
}
continue
}
if time.Now().After(resp.NextUpdate) {
// response has expired; delete it
err = storage.Delete(ctx, key)
if err != nil {
logger.Error("purging expired staple file", zap.String("storage_key", key), zap.Error(err))
}
}
}
return nil
}
func deleteExpiredCerts(ctx context.Context, storage Storage, logger *zap.Logger, gracePeriod time.Duration) error {
issuerKeys, err := storage.List(ctx, prefixCerts, false)
if err != nil {
// maybe just hasn't been created yet; no big deal
return nil
}
for _, issuerKey := range issuerKeys {
siteKeys, err := storage.List(ctx, issuerKey, false)
if err != nil {
logger.Error("listing contents", zap.String("issuer_key", issuerKey), zap.Error(err))
continue
}
for _, siteKey := range siteKeys {
// if context was cancelled, quit early; otherwise proceed
select {
case <-ctx.Done():
return ctx.Err()
default:
}
siteAssets, err := storage.List(ctx, siteKey, false)
if err != nil {
logger.Error("listing site contents", zap.String("site_key", siteKey), zap.Error(err))
continue
}
for _, assetKey := range siteAssets {
if path.Ext(assetKey) != ".crt" {
continue
}
certFile, err := storage.Load(ctx, assetKey)
if err != nil {
return fmt.Errorf("loading certificate file %s: %v", assetKey, err)
}
block, _ := pem.Decode(certFile)
if block == nil || block.Type != "CERTIFICATE" {
return fmt.Errorf("certificate file %s does not contain PEM-encoded certificate", assetKey)
}
cert, err := x509.ParseCertificate(block.Bytes)
if err != nil {
return fmt.Errorf("certificate file %s is malformed; error parsing PEM: %v", assetKey, err)
}
if expiredTime := time.Since(expiresAt(cert)); expiredTime >= gracePeriod {
logger.Info("certificate expired beyond grace period; cleaning up",
zap.String("asset_key", assetKey),
zap.Duration("expired_for", expiredTime),
zap.Duration("grace_period", gracePeriod))
baseName := strings.TrimSuffix(assetKey, ".crt")
for _, relatedAsset := range []string{
assetKey,
baseName + ".key",
baseName + ".json",
} {
logger.Info("deleting asset because resource expired", zap.String("asset_key", relatedAsset))
err := storage.Delete(ctx, relatedAsset)
if err != nil {
logger.Error("could not clean up asset related to expired certificate",
zap.String("base_name", baseName),
zap.String("related_asset", relatedAsset),
zap.Error(err))
}
}
}
}
// update listing; if folder is empty, delete it
siteAssets, err = storage.List(ctx, siteKey, false)
if err != nil {
continue
}
if len(siteAssets) == 0 {
logger.Info("deleting site folder because key is empty", zap.String("site_key", siteKey))
err := storage.Delete(ctx, siteKey)
if err != nil {
return fmt.Errorf("deleting empty site folder %s: %v", siteKey, err)
}
}
}
}
return nil
}

// forceRenew forcefully renews cert and replaces it in the cache, and returns the new certificate. It is intended
// for use primarily in the case of cert revocation. This MUST NOT be called within a lock on cfg.certCacheMu.
func (cfg *Config) forceRenew(ctx context.Context, logger *zap.Logger, cert Certificate) (Certificate, error) {
if cert.ocsp != nil && cert.ocsp.Status == ocsp.Revoked {
logger.Warn("OCSP status for managed certificate is REVOKED; attempting to replace with new certificate",
zap.Strings("identifiers", cert.Names),
zap.Time("expiration", expiresAt(cert.Leaf)))
} else {
logger.Warn("forcefully renewing certificate",
zap.Strings("identifiers", cert.Names),
zap.Time("expiration", expiresAt(cert.Leaf)))
}
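// the obtain/renew operations below are performed using the certificate's first name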
renewName := cert.Names[0]
// if revoked for key compromise, we can't be sure whether the storage of
// the key is still safe; however, we KNOW the old key is not safe, and we
// can only hope by the time of revocation that storage has been secured;
// key management is not something we want to get into, but in this case
// it seems prudent to replace the key - and since renewal requires reuse
// of a prior key, we can't do a "renew" to replace the cert if we need a
// new key, so we'll have to do an obtain instead
var obtainInsteadOfRenew bool
if cert.ocsp != nil && cert.ocsp.RevocationReason == acme.ReasonKeyCompromise {
err := cfg.moveCompromisedPrivateKey(ctx, cert, logger)
if err != nil {
logger.Error("could not remove compromised private key from use",
zap.Strings("identifiers", cert.Names),
zap.String("issuer", cert.issuerKey),
zap.Error(err))
}
obtainInsteadOfRenew = true
}
var err error
if obtainInsteadOfRenew {
err = cfg.ObtainCertAsync(ctx, renewName)
} else {
// notice that we force renewal; otherwise, it might see that the
// certificate isn't close to expiring and return, but we really
// need a replacement certificate! see issue #4191
err = cfg.RenewCertAsync(ctx, renewName, true)
}
if err != nil {
if cert.ocsp != nil && cert.ocsp.Status == ocsp.Revoked {
// probably better to not serve a revoked certificate at all
logger.Error("unable to obtain new to certificate after OCSP status of REVOKED; removing from cache",
zap.Strings("identifiers", cert.Names),
zap.Error(err))
cfg.certCache.mu.Lock()
cfg.certCache.removeCertificate(cert)
cfg.certCache.mu.Unlock()
}
return cert, fmt.Errorf("unable to forcefully get new certificate for %v: %w", cert.Names, err)
}
return cfg.reloadManagedCertificate(ctx, cert)
}

// moveCompromisedPrivateKey moves the private key for cert to a ".compromised" file
// by copying the data to the new file, then deleting the old one.
func (cfg *Config) moveCompromisedPrivateKey(ctx context.Context, cert Certificate, logger *zap.Logger) error {
privKeyStorageKey := StorageKeys.SitePrivateKey(cert.issuerKey, cert.Names[0])
privKeyPEM, err := cfg.Storage.Load(ctx, privKeyStorageKey)
if err != nil {
return err
}
compromisedPrivKeyStorageKey := privKeyStorageKey + ".compromised"
err = cfg.Storage.Store(ctx, compromisedPrivKeyStorageKey, privKeyPEM)
if err != nil {
// better safe than sorry: as a last resort, try deleting the key so it won't be reused
cfg.Storage.Delete(ctx, privKeyStorageKey)
return err
}
err = cfg.Storage.Delete(ctx, privKeyStorageKey)
if err != nil {
return err
}
logger.Info("removed certificate's compromised private key from use",
zap.String("storage_path", compromisedPrivKeyStorageKey),
zap.Strings("identifiers", cert.Names),
zap.String("issuer", cert.issuerKey))
return nil
}

// certShouldBeForceRenewed returns true if cert should be forcefully renewed
// (like if it is revoked according to its OCSP response).
func certShouldBeForceRenewed(cert Certificate) bool {
return cert.managed &&
len(cert.Names) > 0 &&
cert.ocsp != nil &&
cert.ocsp.Status == ocsp.Revoked
}
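
// certList is a list of certificates with set-like insertion,
// deduplicated by certificate hash.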
type certList []Certificate

// insert appends cert to the list if it is not already in the list.
// Efficiency: O(n)
func (certs *certList) insert(cert Certificate) {
for _, c := range *certs {
if c.hash == cert.hash {
return
}
}
*certs = append(*certs, cert)
}
const (
// DefaultRenewCheckInterval is how often to check certificates for expiration.
// Scans are very lightweight, so this can be semi-frequent. This default should
// be smaller than <Minimum Cert Lifetime>*DefaultRenewalWindowRatio/3, which
// gives certificates plenty of chance to be renewed on time.
DefaultRenewCheckInterval = 10 * time.Minute
// DefaultRenewalWindowRatio is how much of a certificate's lifetime becomes the
// renewal window. The renewal window is the span of time at the end of the
// certificate's validity period in which it should be renewed. A default value
// of ~1/3 is pretty safe and recommended for most certificates.
DefaultRenewalWindowRatio = 1.0 / 3.0
// DefaultOCSPCheckInterval is how often to check if OCSP stapling needs updating.
DefaultOCSPCheckInterval = 1 * time.Hour
)