Initial implementation of ARI (#286)

* Initial implementation of ARI

* Enhance redundancy, robustness, and logging

* Improve ARI updating; integrate on-demand TLS; detect changed window
This commit is contained in:
Matt Holt 2024-05-07 09:46:03 -06:00 committed by GitHub
parent fa7161a1a8
commit 0e88b3eaa1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 528 additions and 100 deletions

View File

@ -137,44 +137,21 @@ func (iss *ACMEIssuer) newACMEClientWithAccount(ctx context.Context, useTestCA,
// independent of any particular ACME account. If useTestCA is true, am.TestCA
// will be used if it is set; otherwise, the primary CA will be used.
func (iss *ACMEIssuer) newACMEClient(useTestCA bool) (*acmez.Client, error) {
// ensure defaults are filled in
var caURL string
if useTestCA {
caURL = iss.TestCA
client, err := iss.newBasicACMEClient()
if err != nil {
return nil, err
}
if caURL == "" {
caURL = iss.CA
}
if caURL == "" {
caURL = DefaultACME.CA
// fill in a little more beyond a basic client
if useTestCA && iss.TestCA != "" {
client.Client.Directory = iss.TestCA
}
certObtainTimeout := iss.CertObtainTimeout
if certObtainTimeout == 0 {
certObtainTimeout = DefaultACME.CertObtainTimeout
}
// ensure endpoint is secure (assume HTTPS if scheme is missing)
if !strings.Contains(caURL, "://") {
caURL = "https://" + caURL
}
u, err := url.Parse(caURL)
if err != nil {
return nil, err
}
if u.Scheme != "https" && !SubjectIsInternal(u.Host) {
return nil, fmt.Errorf("%s: insecure CA URL (HTTPS required for non-internal CA)", caURL)
}
client := &acmez.Client{
Client: &acme.Client{
Directory: caURL,
PollTimeout: certObtainTimeout,
UserAgent: buildUAString(),
HTTPClient: iss.httpClient,
},
ChallengeSolvers: make(map[string]acmez.Solver),
}
client.Logger = iss.Logger.Named("acme_client")
client.Client.PollTimeout = certObtainTimeout
client.ChallengeSolvers = make(map[string]acmez.Solver)
// configure challenges (most of the time, DNS challenge is
// exclusive of other ones because it is usually only used
@ -230,6 +207,42 @@ func (iss *ACMEIssuer) newACMEClient(useTestCA bool) (*acmez.Client, error) {
return client, nil
}
// newBasicACMEClient sets up a basically-functional ACME client that is not capable
// of solving challenges but can provide basic interactions with the server.
func (iss *ACMEIssuer) newBasicACMEClient() (*acmez.Client, error) {
caURL := iss.CA
if caURL == "" {
caURL = DefaultACME.CA
}
// ensure endpoint is secure (assume HTTPS if scheme is missing)
if !strings.Contains(caURL, "://") {
caURL = "https://" + caURL
}
u, err := url.Parse(caURL)
if err != nil {
return nil, err
}
if u.Scheme != "https" && !SubjectIsInternal(u.Host) {
return nil, fmt.Errorf("%s: insecure CA URL (HTTPS required for non-internal CA)", caURL)
}
return &acmez.Client{
Client: &acme.Client{
Directory: caURL,
UserAgent: buildUAString(),
HTTPClient: iss.httpClient,
Logger: iss.Logger.Named("acme_client"),
},
}, nil
}
func (iss *ACMEIssuer) getRenewalInfo(ctx context.Context, cert Certificate) (acme.RenewalInfo, error) {
acmeClient, err := iss.newBasicACMEClient()
if err != nil {
return acme.RenewalInfo{}, err
}
return acmeClient.GetRenewalInfo(ctx, cert.Certificate.Leaf)
}
func (iss *ACMEIssuer) getHTTPPort() int {
useHTTPPort := HTTPChallengePort
if HTTPPort > 0 && HTTPPort != HTTPChallengePort {

View File

@ -362,12 +362,13 @@ func (am *ACMEIssuer) Issue(ctx context.Context, csr *x509.CertificateRequest) (
panic("missing config pointer (must use NewACMEIssuer)")
}
var isRetry bool
if attempts, ok := ctx.Value(AttemptsCtxKey).(*int); ok {
isRetry = *attempts > 0
var attempts int
if attemptsPtr, ok := ctx.Value(AttemptsCtxKey).(*int); ok {
attempts = *attemptsPtr
}
isRetry := attempts > 0
cert, usedTestCA, err := am.doIssue(ctx, csr, isRetry)
cert, usedTestCA, err := am.doIssue(ctx, csr, attempts)
if err != nil {
return nil, err
}
@ -395,7 +396,7 @@ func (am *ACMEIssuer) Issue(ctx context.Context, csr *x509.CertificateRequest) (
// other endpoint. This is more likely to happen if a user is testing with
// the staging CA as the main CA, then changes their configuration once they
// think they are ready for the production endpoint.
cert, _, err = am.doIssue(ctx, csr, false)
cert, _, err = am.doIssue(ctx, csr, 0)
if err != nil {
// succeeded with test CA but failed just now with the production CA;
// either we are observing differing internal states of each CA that will
@ -423,7 +424,8 @@ func (am *ACMEIssuer) Issue(ctx context.Context, csr *x509.CertificateRequest) (
return cert, err
}
func (am *ACMEIssuer) doIssue(ctx context.Context, csr *x509.CertificateRequest, useTestCA bool) (*IssuedCertificate, bool, error) {
func (am *ACMEIssuer) doIssue(ctx context.Context, csr *x509.CertificateRequest, attempts int) (*IssuedCertificate, bool, error) {
useTestCA := attempts > 0
client, err := am.newACMEClientWithAccount(ctx, useTestCA, false)
if err != nil {
return nil, false, err
@ -449,6 +451,22 @@ func (am *ACMEIssuer) doIssue(ctx context.Context, csr *x509.CertificateRequest,
params.NotAfter = time.Now().Add(am.NotAfter)
}
// Notify the ACME server we are replacing a certificate (if the caller says we are),
// only if the following conditions are met:
// - The caller has set a Replaces value in the context, indicating this is a renewal.
// - Not using test CA. This should be obvious, but a test CA should be in a separate
// environment from production, and thus not have knowledge of the cert being replaced.
// - Not a certain attempt number. We skip setting Replaces once early on in the retries
// in case the reason the order is failing is only because there is a state inconsistency
// between client and server or some sort of bookkeeping error with regards to the certID
// and the server is rejecting the ARI certID. In any case, an invalid certID may cause
// orders to fail. So try once without setting it.
if !usingTestCA && attempts != 2 {
if replacing, ok := ctx.Value(ctxKeyARIReplaces).(*x509.Certificate); ok {
params.Replaces = replacing
}
}
// do this in a loop because there's an error case that may necessitate a retry, but not more than once
var certChains []acme.Certificate
for i := 0; i < 2; i++ {
@ -631,6 +649,10 @@ const (
// prefixACME is the storage key prefix used for ACME-specific assets.
const prefixACME = "acme"
type ctxKey string
const ctxKeyARIReplaces = ctxKey("ari_replaces")
// Interface guards
var (
_ PreChecker = (*ACMEIssuer)(nil)

View File

@ -18,12 +18,15 @@ import (
"context"
"crypto/tls"
"crypto/x509"
"encoding/json"
"fmt"
"math/rand"
"net"
"os"
"strings"
"time"
"github.com/mholt/acmez/v2/acme"
"go.uber.org/zap"
"golang.org/x/crypto/ocsp"
)
@ -56,6 +59,9 @@ type Certificate struct {
// The unique string identifying the issuer of this certificate.
issuerKey string
// ACME Renewal Information, if available
ari acme.RenewalInfo
}
// Empty returns true if the certificate struct is not filled out; at
@ -67,10 +73,106 @@ func (cert Certificate) Empty() bool {
// Hash returns a checksum of the certificate chain's DER-encoded bytes.
func (cert Certificate) Hash() string { return cert.hash }
// NeedsRenewal returns true if the certificate is
// expiring soon (according to cfg) or has expired.
// NeedsRenewal returns true if the certificate is expiring
// soon (according to ARI and/or cfg) or has expired.
func (cert Certificate) NeedsRenewal(cfg *Config) bool {
return currentlyInRenewalWindow(cert.Leaf.NotBefore, expiresAt(cert.Leaf), cfg.RenewalWindowRatio)
return cfg.certNeedsRenewal(cert.Leaf, cert.ari, true)
}
// certNeedsRenewal consults ACME Renewal Info (ARI) and certificate expiration to determine
// whether the leaf certificate needs to be renewed yet. If true is returned, the certificate
// should be renewed as soon as possible. The reasoning for a true return value is logged
// unless emitLogs is false; this can be useful to suppress noisy logs in the case where you
// first call this to determine if a cert in memory needs renewal, and then right after you
// call it again to see if the cert in storage still needs renewal -- you probably don't want
// to log the second time for checking the cert in storage which is mainly for synchronization.
func (cfg *Config) certNeedsRenewal(leaf *x509.Certificate, ari acme.RenewalInfo, emitLogs bool) bool {
expiration := expiresAt(leaf)
var logger *zap.Logger
if emitLogs {
logger = cfg.Logger.With(
zap.Strings("subjects", leaf.DNSNames),
zap.Time("expiration", expiration),
zap.String("ari_cert_id", ari.UniqueIdentifier),
zap.Timep("next_ari_update", ari.RetryAfter),
zap.Duration("renew_check_interval", cfg.certCache.options.RenewCheckInterval),
zap.Time("window_start", ari.SuggestedWindow.Start),
zap.Time("window_end", ari.SuggestedWindow.End))
} else {
logger = zap.NewNop()
}
// first check ARI: if it says it's time to renew, it's time to renew
// (notice that we don't strictly require an ARI window to also exist; we presume
// that if a time has been selected, a window does or did exist, even if it didn't
// get stored/encoded for some reason - but also: this allows administrators to
// manually or explicitly schedule a renewal time indepedently of ARI which could
// be useful)
selectedTime := ari.SelectedTime
// if, for some reason a random time in the window hasn't been selected yet, but an ARI
// window does exist, we can always improvise one... even if this is called repeatedly,
// a random time is a random time, whether you generate it once or more :D
// (code borrowed from our acme package)
if selectedTime.IsZero() &&
(!ari.SuggestedWindow.Start.IsZero() && !ari.SuggestedWindow.End.IsZero()) {
start, end := ari.SuggestedWindow.Start.Unix()+1, ari.SuggestedWindow.End.Unix()
selectedTime = time.Unix(rand.Int63n(end-start)+start, 0).UTC()
logger.Warn("no renewal time had been selected with ARI; chose an ephemeral one for now",
zap.Time("ephemeral_selected_time", selectedTime))
}
// if a renewal time has been selected, start with that
if !selectedTime.IsZero() {
// ARI spec recommends an algorithm that renews after the randomly-selected
// time OR just before it if the next waking time would be after it; this
// cutoff can actually be before the start of the renewal window, but the spec
// author says that's OK: https://github.com/aarongable/draft-acme-ari/issues/71
cutoff := ari.SelectedTime.Add(-cfg.certCache.options.RenewCheckInterval)
if time.Now().After(cutoff) {
logger.Info("certificate needs renewal based on ARI window",
zap.Time("selected_time", selectedTime),
zap.Time("renewal_cutoff", cutoff))
return true
}
// according to ARI, we are not ready to renew; however, we do not rely solely on
// ARI calculations... what if there is a bug in our implementation, or in the
// server's, or the stored metadata? for redundancy, give credence to the expiration
// date; ignore ARI if we are past a "dangerously close" limit, to avoid any
// possibility of a bug in ARI compromising a site's uptime: we should always always
// always give heed to actual validity period
if currentlyInRenewalWindow(leaf.NotBefore, expiration, 1.0/20.0) {
logger.Warn("certificate is in emergency renewal window; superceding ARI",
zap.Duration("remaining", time.Until(expiration)),
zap.Time("renewal_cutoff", cutoff))
return true
}
}
// the normal check, in the absence of ARI, is to determine if we're near enough (or past)
// the expiration date based on the configured remaining:lifetime ratio
if currentlyInRenewalWindow(leaf.NotBefore, expiration, cfg.RenewalWindowRatio) {
logger.Info("certificate is in configured renewal window based on expiration date",
zap.Duration("remaining", time.Until(expiration)))
return true
}
// finally, if the certificate is expiring imminently, always attempt a renewal;
// we check both a (very low) lifetime ratio and also a strict difference between
// the time until expiration and the interval at which we run the standard maintenance
// routine to check for renewals, to accommodate both exceptionally long and short
// cert lifetimes
if currentlyInRenewalWindow(leaf.NotBefore, expiration, 1.0/50.0) ||
time.Until(expiration) < cfg.certCache.options.RenewCheckInterval*5 {
logger.Warn("certificate is in emergency renewal window; expiration imminent",
zap.Duration("remaining", time.Until(expiration)))
return true
}
return false
}
// Expired returns true if the certificate has expired.
@ -85,10 +187,12 @@ func (cert Certificate) Expired() bool {
return time.Now().After(expiresAt(cert.Leaf))
}
// currentlyInRenewalWindow returns true if the current time is
// within the renewal window, according to the given start/end
// currentlyInRenewalWindow returns true if the current time is within
// (or after) the renewal window, according to the given start/end
// dates and the ratio of the renewal window. If true is returned,
// the certificate being considered is due for renewal.
// the certificate being considered is due for renewal. The ratio
// is remaining:total time, i.e. 1/3 = 1/3 of lifetime remaining,
// or 9/10 = 9/10 of time lifetime remaining.
func currentlyInRenewalWindow(notBefore, notAfter time.Time, renewalWindowRatio float64) bool {
if notAfter.IsZero() {
return false
@ -154,9 +258,37 @@ func (cfg *Config) loadManagedCertificate(ctx context.Context, domain string) (C
}
cert.managed = true
cert.issuerKey = certRes.issuerKey
if ari, err := certRes.getARI(); err == nil && ari != nil {
cert.ari = *ari
}
return cert, nil
}
// getARI unpacks ACME Renewal Information from the issuer data, if available.
// It is only an error if there is invalid JSON.
func (certRes CertificateResource) getARI() (*acme.RenewalInfo, error) {
acmeData, err := certRes.getACMEData()
if err != nil {
return nil, err
}
return acmeData.RenewalInfo, nil
}
// getACMEData returns the ACME certificate metadata from the IssuerData, but
// note that a non-ACME-issued certificate may return an empty value and nil
// since the JSON may still decode successfully but just not match any or all
// of the fields. Remember that the IssuerKey is used to store and access the
// cert files in the first place (it is part of the path) so in theory if you
// load a CertificateResource from an ACME issuer it should work as expected.
func (certRes CertificateResource) getACMEData() (acme.Certificate, error) {
if len(certRes.IssuerData) == 0 {
return acme.Certificate{}, nil
}
var acmeCert acme.Certificate
err := json.Unmarshal(certRes.IssuerData, &acmeCert)
return acmeCert, err
}
// CacheUnmanagedCertificatePEMFile loads a certificate for host using certFile
// and keyFile, which must be in PEM format. It stores the certificate in
// the in-memory cache and returns the hash, useful for removing from the cache.
@ -329,21 +461,22 @@ func fillCertFromLeaf(cert *Certificate, tlsCert tls.Certificate) error {
return nil
}
// managedCertInStorageExpiresSoon returns true if cert (being a
// managed certificate) is expiring within RenewDurationBefore.
// It returns false if there was an error checking the expiration
// of the certificate as found in storage, or if the certificate
// in storage is NOT expiring soon. A certificate that is expiring
// managedCertInStorageNeedsRenewal returns true if cert (being a
// managed certificate) is expiring soon (according to cfg) or if
// ACME Renewal Information (ARI) is available and says that it is
// time to renew (it uses existing ARI; it does not update it).
// It returns false if there was an error, the cert is not expiring
// soon, and ARI window is still future. A certificate that is expiring
// soon in our cache but is not expiring soon in storage probably
// means that another instance renewed the certificate in the
// meantime, and it would be a good idea to simply load the cert
// into our cache rather than repeating the renewal process again.
func (cfg *Config) managedCertInStorageExpiresSoon(ctx context.Context, cert Certificate) (bool, error) {
func (cfg *Config) managedCertInStorageNeedsRenewal(ctx context.Context, cert Certificate) (bool, error) {
certRes, err := cfg.loadCertResourceAnyIssuer(ctx, cert.Names[0])
if err != nil {
return false, err
}
_, needsRenew := cfg.managedCertNeedsRenewal(certRes)
_, _, needsRenew := cfg.managedCertNeedsRenewal(certRes, false)
return needsRenew, nil
}

View File

@ -39,6 +39,7 @@ import (
"crypto"
"crypto/tls"
"crypto/x509"
"encoding/json"
"fmt"
"log"
"net"
@ -388,7 +389,8 @@ type IssuedCertificate struct {
Certificate []byte
// Any extra information to serialize alongside the
// certificate in storage.
// certificate in storage. It MUST be serializable
// as JSON in order to be preserved.
Metadata any
}
@ -409,7 +411,7 @@ type CertificateResource struct {
// Any extra information associated with the certificate,
// usually provided by the issuer implementation.
IssuerData any `json:"issuer_data,omitempty"`
IssuerData json.RawMessage `json:"issuer_data,omitempty"`
// The unique string identifying the issuer of the
// certificate; internally useful for storage access.

View File

@ -52,6 +52,7 @@ type Config struct {
// it should be renewed; for most certificates, the
// global default is good, but for extremely short-
// lived certs, you may want to raise this to ~0.5.
// Ratio is remaining:total lifetime.
RenewalWindowRatio float64
// An optional event callback clients can set
@ -446,6 +447,15 @@ func (cfg *Config) manageOne(ctx context.Context, domainName string, async bool)
return err
}
// ensure ARI is updated before we check whether the cert needs renewing
// (we ignore the second return value because we already check if needs renewing anyway)
if cert.ari.NeedsRefresh() {
cert, _, err = cfg.updateARI(ctx, cert, cfg.Logger)
if err != nil {
cfg.Logger.Error("updating ARI upon managing", zap.Error(err))
}
}
// otherwise, simply renew the certificate if needed
if cert.NeedsRenewal(cfg) {
var err error
@ -639,11 +649,15 @@ func (cfg *Config) obtainCert(ctx context.Context, name string, interactive bool
issuerKey := issuerUsed.IssuerKey()
// success - immediately save the certificate resource
metaJSON, err := json.Marshal(issuedCert.Metadata)
if err != nil {
log.Error("unable to encode certificate metadata", zap.Error(err))
}
certRes := CertificateResource{
SANs: namesFromCSR(csr),
CertificatePEM: issuedCert.Certificate,
PrivateKeyPEM: privKeyPEM,
IssuerData: issuedCert.Metadata,
IssuerData: metaJSON,
issuerKey: issuerUsed.IssuerKey(),
}
err = cfg.saveCertResource(ctx, issuerUsed, certRes)
@ -792,7 +806,7 @@ func (cfg *Config) renewCert(ctx context.Context, name string, force, interactiv
}
// check if renew is still needed - might have been renewed while waiting for lock
timeLeft, needsRenew := cfg.managedCertNeedsRenewal(certRes)
timeLeft, leaf, needsRenew := cfg.managedCertNeedsRenewal(certRes, false)
if !needsRenew {
if force {
log.Info("certificate does not need to be renewed, but renewal is being forced",
@ -869,6 +883,18 @@ func (cfg *Config) renewCert(ctx context.Context, name string, force, interactiv
}
}
// if we're renewing with the same ACME CA as before, have the ACME
// client tell the server we are replacing a certificate (but doing
// this on the wrong CA, or when the CA doesn't recognize the certID,
// can fail the order)
if acmeData, err := certRes.getACMEData(); err == nil && acmeData.CA != "" {
if acmeIss, ok := issuer.(*ACMEIssuer); ok {
if acmeIss.CA == acmeData.CA {
ctx = context.WithValue(ctx, ctxKeyARIReplaces, leaf)
}
}
}
issuedCert, err = issuer.Issue(ctx, useCSR)
if err == nil {
issuerUsed = issuer
@ -902,11 +928,15 @@ func (cfg *Config) renewCert(ctx context.Context, name string, force, interactiv
issuerKey := issuerUsed.IssuerKey()
// success - immediately save the renewed certificate resource
metaJSON, err := json.Marshal(issuedCert.Metadata)
if err != nil {
log.Error("unable to encode certificate metadata", zap.Error(err))
}
newCertRes := CertificateResource{
SANs: namesFromCSR(csr),
CertificatePEM: issuedCert.Certificate,
PrivateKeyPEM: certRes.PrivateKeyPEM,
IssuerData: issuedCert.Metadata,
IssuerData: metaJSON,
issuerKey: issuerKey,
}
err = cfg.saveCertResource(ctx, issuerUsed, newCertRes)
@ -1206,14 +1236,19 @@ func (cfg *Config) lockKey(op, domainName string) string {
// managedCertNeedsRenewal returns true if certRes is expiring soon or already expired,
// or if the process of decoding the cert and checking its expiration returned an error.
func (cfg *Config) managedCertNeedsRenewal(certRes CertificateResource) (time.Duration, bool) {
// If there wasn't an error, the leaf cert is also returned, so it can be reused if
// necessary, since we are parsing the PEM bundle anyway.
func (cfg *Config) managedCertNeedsRenewal(certRes CertificateResource, emitLogs bool) (time.Duration, *x509.Certificate, bool) {
certChain, err := parseCertsFromPEMBundle(certRes.CertificatePEM)
if err != nil {
return 0, true
if err != nil || len(certChain) == 0 {
return 0, nil, true
}
var ari acme.RenewalInfo
if ariPtr, err := certRes.getARI(); err == nil && ariPtr != nil {
ari = *ariPtr
}
remaining := time.Until(expiresAt(certChain[0]))
needsRenew := currentlyInRenewalWindow(certChain[0].NotBefore, expiresAt(certChain[0]), cfg.RenewalWindowRatio)
return remaining, needsRenew
return remaining, certChain[0], cfg.certNeedsRenewal(certChain[0], ari, emitLogs)
}
func (cfg *Config) emit(ctx context.Context, eventName string, data map[string]any) error {

View File

@ -15,7 +15,9 @@
package certmagic
import (
"bytes"
"context"
"encoding/json"
"os"
"reflect"
"testing"
@ -51,9 +53,9 @@ func TestSaveCertResource(t *testing.T) {
SANs: []string{domain},
PrivateKeyPEM: []byte(keyContents),
CertificatePEM: []byte(certContents),
IssuerData: &acme.Certificate{
IssuerData: mustJSON(acme.Certificate{
URL: "https://example.com/cert",
},
}),
issuerKey: am.IssuerKey(),
}
@ -62,17 +64,22 @@ func TestSaveCertResource(t *testing.T) {
t.Fatalf("Expected no error, got: %v", err)
}
// the result of our test will be a map, since we have
// no choice but to decode it into an 'any' interface
cert.IssuerData = map[string]any{
"url": "https://example.com/cert",
}
siteData, err := testConfig.loadCertResource(ctx, am, domain)
if err != nil {
t.Fatalf("Expected no error reading site, got: %v", err)
}
siteData.IssuerData = bytes.ReplaceAll(siteData.IssuerData, []byte("\t"), []byte(""))
siteData.IssuerData = bytes.ReplaceAll(siteData.IssuerData, []byte("\n"), []byte(""))
siteData.IssuerData = bytes.ReplaceAll(siteData.IssuerData, []byte(" "), []byte(""))
if !reflect.DeepEqual(cert, siteData) {
t.Errorf("Expected '%+v' to match '%+v'", cert, siteData)
t.Errorf("Expected '%+v' to match '%+v'\n%s\n%s", cert.IssuerData, siteData.IssuerData, string(cert.IssuerData), string(siteData.IssuerData))
}
}
func mustJSON(val any) []byte {
result, err := json.Marshal(val)
if err != nil {
panic("marshaling JSON: " + err.Error())
}
return result
}

2
go.mod
View File

@ -8,7 +8,7 @@ require (
github.com/caddyserver/zerossl v0.1.2
github.com/klauspost/cpuid/v2 v2.2.7
github.com/libdns/libdns v0.2.2
github.com/mholt/acmez/v2 v2.0.0
github.com/mholt/acmez/v2 v2.0.1-0.20240506200913-5a16e768dea9
github.com/miekg/dns v1.1.58
github.com/zeebo/blake3 v0.2.3
go.uber.org/zap v1.27.0

4
go.sum
View File

@ -7,8 +7,8 @@ github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuV
github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
github.com/libdns/libdns v0.2.2 h1:O6ws7bAfRPaBsgAYt8MDe2HcNBGC29hkZ9MX2eUSX3s=
github.com/libdns/libdns v0.2.2/go.mod h1:4Bj9+5CQiNMVGf87wjX4CY3HQJypUHRuLvlsfsZqLWQ=
github.com/mholt/acmez/v2 v2.0.0 h1:FsGoEdA5mPhAaGfrwIEM13jnn2hn3jU6Vny+RWypt3o=
github.com/mholt/acmez/v2 v2.0.0/go.mod h1:fX4c9r5jYwMyMsC+7tkYRxHibkOTgta5DIFGoe67e1U=
github.com/mholt/acmez/v2 v2.0.1-0.20240506200913-5a16e768dea9 h1:b8yqT+RDihRjjBRq2FWtnduFHqPlpyrKM4ZH0UatgKM=
github.com/mholt/acmez/v2 v2.0.1-0.20240506200913-5a16e768dea9/go.mod h1:fX4c9r5jYwMyMsC+7tkYRxHibkOTgta5DIFGoe67e1U=
github.com/miekg/dns v1.1.58 h1:ca2Hdkz+cDg/7eNF6V56jjzuZ4aCAE+DbVkILdQWG/4=
github.com/miekg/dns v1.1.58/go.mod h1:Ypv+3b/KadlvW9vJfXOTf300O4UqaHFzFCuHz+rPkBY=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=

View File

@ -546,11 +546,11 @@ func (cfg *Config) obtainOnDemandCertificate(ctx context.Context, hello *tls.Cli
//
// This function is safe for use by multiple concurrent goroutines.
func (cfg *Config) handshakeMaintenance(ctx context.Context, hello *tls.ClientHelloInfo, cert Certificate) (Certificate, error) {
log := cfg.Logger.Named("on_demand")
logger := cfg.Logger.Named("on_demand")
// Check OCSP staple validity
if cert.ocsp != nil && !freshOCSP(cert.ocsp) {
log.Debug("OCSP response needs refreshing",
logger.Debug("OCSP response needs refreshing",
zap.Strings("identifiers", cert.Names),
zap.Int("ocsp_status", cert.ocsp.Status),
zap.Time("this_update", cert.ocsp.ThisUpdate),
@ -560,12 +560,12 @@ func (cfg *Config) handshakeMaintenance(ctx context.Context, hello *tls.ClientHe
if err != nil {
// An error with OCSP stapling is not the end of the world, and in fact, is
// quite common considering not all certs have issuer URLs that support it.
log.Warn("stapling OCSP",
logger.Warn("stapling OCSP",
zap.String("server_name", hello.ServerName),
zap.Strings("sans", cert.Names),
zap.Error(err))
} else {
log.Debug("successfully stapled new OCSP response",
logger.Debug("successfully stapled new OCSP response",
zap.Strings("identifiers", cert.Names),
zap.Int("ocsp_status", cert.ocsp.Status),
zap.Time("this_update", cert.ocsp.ThisUpdate),
@ -578,10 +578,20 @@ func (cfg *Config) handshakeMaintenance(ctx context.Context, hello *tls.ClientHe
cfg.certCache.mu.Unlock()
}
// Check ARI status
if cert.ari.NeedsRefresh() {
// we ignore the second return value here because we go on to check renewal status below regardless
var err error
cert, _, err = cfg.updateARI(ctx, cert, logger)
if err != nil {
logger.Error("updated ARI", zap.Error(err))
}
}
// We attempt to replace any certificates that were revoked.
// Crucially, this happens OUTSIDE a lock on the certCache.
if certShouldBeForceRenewed(cert) {
log.Warn("on-demand certificate's OCSP status is REVOKED; will try to forcefully renew",
logger.Warn("on-demand certificate's OCSP status is REVOKED; will try to forcefully renew",
zap.Strings("identifiers", cert.Names),
zap.Int("ocsp_status", cert.ocsp.Status),
zap.Time("revoked_at", cert.ocsp.RevokedAt),
@ -591,14 +601,13 @@ func (cfg *Config) handshakeMaintenance(ctx context.Context, hello *tls.ClientHe
}
// Check cert expiration
if currentlyInRenewalWindow(cert.Leaf.NotBefore, expiresAt(cert.Leaf), cfg.RenewalWindowRatio) {
if cfg.certNeedsRenewal(cert.Leaf, cert.ari, true) {
// Check if the certificate still exists on disk. If not, we need to obtain a new one.
// This can happen if the certificate was cleaned up by the storage cleaner, but still
// remains in the in-memory cache.
if !cfg.storageHasCertResourcesAnyIssuer(ctx, cert.Names[0]) {
log.Debug("certificate not found on disk; obtaining new certificate",
logger.Debug("certificate not found on disk; obtaining new certificate",
zap.Strings("identifiers", cert.Names))
return cfg.obtainOnDemandCertificate(ctx, hello)
}
// Otherwise, renew the certificate.
@ -620,7 +629,7 @@ func (cfg *Config) handshakeMaintenance(ctx context.Context, hello *tls.ClientHe
//
// This function is safe for use by multiple concurrent goroutines.
func (cfg *Config) renewDynamicCertificate(ctx context.Context, hello *tls.ClientHelloInfo, currentCert Certificate) (Certificate, error) {
log := logWithRemote(cfg.Logger.Named("on_demand"), hello)
logger := logWithRemote(cfg.Logger.Named("on_demand"), hello)
name := cfg.getNameFromClientHello(hello)
timeLeft := time.Until(expiresAt(currentCert.Leaf))
@ -637,7 +646,7 @@ func (cfg *Config) renewDynamicCertificate(ctx context.Context, hello *tls.Clien
// renewing it, so we might as well serve what we have without blocking, UNLESS
// we're forcing renewal, in which case the current certificate is not usable
if timeLeft > 0 && !revoked {
log.Debug("certificate expires soon but is already being renewed; serving current certificate",
logger.Debug("certificate expires soon but is already being renewed; serving current certificate",
zap.Strings("subjects", currentCert.Names),
zap.Duration("remaining", timeLeft))
return currentCert, nil
@ -646,7 +655,7 @@ func (cfg *Config) renewDynamicCertificate(ctx context.Context, hello *tls.Clien
// otherwise, we'll have to wait for the renewal to finish so we don't serve
// a revoked or expired certificate
log.Debug("certificate has expired, but is already being renewed; waiting for renewal to complete",
logger.Debug("certificate has expired, but is already being renewed; waiting for renewal to complete",
zap.Strings("subjects", currentCert.Names),
zap.Time("expired", expiresAt(currentCert.Leaf)),
zap.Bool("revoked", revoked))
@ -677,7 +686,7 @@ func (cfg *Config) renewDynamicCertificate(ctx context.Context, hello *tls.Clien
obtainCertWaitChansMu.Unlock()
}
log = log.With(
logger = logger.With(
zap.String("server_name", name),
zap.Strings("subjects", currentCert.Names),
zap.Time("expiration", expiresAt(currentCert.Leaf)),
@ -698,19 +707,19 @@ func (cfg *Config) renewDynamicCertificate(ctx context.Context, hello *tls.Clien
cfg.certCache.mu.Unlock()
unblockWaiters()
if log != nil {
log.Error("certificate should not be obtained", zap.Error(err))
if logger != nil {
logger.Error("certificate should not be obtained", zap.Error(err))
}
return Certificate{}, err
}
log.Info("attempting certificate renewal")
logger.Info("attempting certificate renewal")
// otherwise, renew with issuer, etc.
var newCert Certificate
if revoked {
newCert, err = cfg.forceRenew(ctx, log, currentCert)
newCert, err = cfg.forceRenew(ctx, logger, currentCert)
} else {
err = cfg.RenewCertAsync(ctx, name, false)
if err == nil {
@ -725,7 +734,7 @@ func (cfg *Config) renewDynamicCertificate(ctx context.Context, hello *tls.Clien
unblockWaiters()
if err != nil {
log.Error("renewing and reloading certificate", zap.String("server_name", name), zap.Error(err))
logger.Error("renewing and reloading certificate", zap.String("server_name", name), zap.Error(err))
}
return newCert, err

View File

@ -92,7 +92,7 @@ func (certCache *Cache) maintainAssets(panicCount int) {
func (certCache *Cache) RenewManagedCertificates(ctx context.Context) error {
log := certCache.logger.Named("maintenance")
// configs will hold a map of certificate name to the config
// configs will hold a map of certificate hash to the config
// to use when managing that certificate
configs := make(map[string]*Config)
@ -102,7 +102,7 @@ func (certCache *Cache) RenewManagedCertificates(ctx context.Context) error {
// words, our first iteration through the certificate cache does NOT
// perform any operations--only queues them--so that more fine-grained
// write locks may be obtained during the actual operations.
var renewQueue, reloadQueue, deleteQueue []Certificate
var renewQueue, reloadQueue, deleteQueue, ariQueue certList
certCache.mu.RLock()
for certKey, cert := range certCache.cache {
@ -135,22 +135,28 @@ func (certCache *Cache) RenewManagedCertificates(ctx context.Context) error {
continue
}
// ACME-specific: see if if ACME Renewal Info (ARI) window needs refreshing
if cert.ari.NeedsRefresh() {
configs[cert.hash] = cfg
ariQueue = append(ariQueue, cert)
}
// if time is up or expires soon, we need to try to renew it
if cert.NeedsRenewal(cfg) {
configs[cert.Names[0]] = cfg
configs[cert.hash] = cfg
// see if the certificate in storage has already been renewed, possibly by another
// instance that didn't coordinate with this one; if so, just load it (this
// might happen if another instance already renewed it - kinda sloppy but checking disk
// first is a simple way to possibly drastically reduce rate limit problems)
storedCertExpiring, err := cfg.managedCertInStorageExpiresSoon(ctx, cert)
storedCertNeedsRenew, err := cfg.managedCertInStorageNeedsRenewal(ctx, cert)
if err != nil {
// hmm, weird, but not a big deal, maybe it was deleted or something
log.Warn("error while checking if stored certificate is also expiring soon",
zap.Strings("identifiers", cert.Names),
zap.Error(err))
} else if !storedCertExpiring {
// if the certificate is NOT expiring soon and there was no error, then we
} else if !storedCertNeedsRenew {
// if the certificate does NOT need renewal and there was no error, then we
// are good to just reload the certificate from storage instead of repeating
// a likely-unnecessary renewal procedure
reloadQueue = append(reloadQueue, cert)
@ -161,11 +167,30 @@ func (certCache *Cache) RenewManagedCertificates(ctx context.Context) error {
// NOTE: It is super-important to note that the TLS-ALPN challenge requires
// a write lock on the cache in order to complete its challenge, so it is extra
// vital that this renew operation does not happen inside our read lock!
renewQueue = append(renewQueue, cert)
renewQueue.insert(cert)
}
}
certCache.mu.RUnlock()
// Update ARI, and then for any certs where the ARI window changed,
// be sure to queue them for renewal if necessary
for _, cert := range ariQueue {
cfg := configs[cert.hash]
cert, changed, err := cfg.updateARI(ctx, cert, log)
if err != nil {
log.Error("updating ARI", zap.Error(err))
}
if changed && cert.NeedsRenewal(cfg) {
// it's theoretically possible that another instance already got the memo
// on the changed ARI and even renewed the cert already, and thus doing it
// here is wasteful, but I have never heard of this happening in reality,
// so to save some cycles for now I think we'll just queue it for renewal
// (notice how we use 'insert' to avoid duplicates, in case it was already
// scheduled for renewal anyway)
renewQueue.insert(cert)
}
}
// Reload certificates that merely need to be updated in memory
for _, oldCert := range reloadQueue {
timeLeft := expiresAt(oldCert.Leaf).Sub(time.Now().UTC())
@ -173,7 +198,7 @@ func (certCache *Cache) RenewManagedCertificates(ctx context.Context) error {
zap.Strings("identifiers", oldCert.Names),
zap.Duration("remaining", timeLeft))
cfg := configs[oldCert.Names[0]]
cfg := configs[oldCert.hash]
// crucially, this happens OUTSIDE a lock on the certCache
_, err := cfg.reloadManagedCertificate(ctx, oldCert)
@ -187,7 +212,7 @@ func (certCache *Cache) RenewManagedCertificates(ctx context.Context) error {
// Renewal queue
for _, oldCert := range renewQueue {
cfg := configs[oldCert.Names[0]]
cfg := configs[oldCert.hash]
err := certCache.queueRenewalTask(ctx, oldCert, cfg)
if err != nil {
log.Error("queueing renewal task",
@ -390,6 +415,171 @@ func (certCache *Cache) updateOCSPStaples(ctx context.Context) {
}
}
// storageHasNewerARI returns true if the configured storage has ARI that is newer
// than that of a certificate that is already loaded, along with the value from
// storage.
func (cfg *Config) storageHasNewerARI(ctx context.Context, cert Certificate) (bool, acme.RenewalInfo, error) {
storedCertData, err := cfg.loadStoredACMECertificateMetadata(ctx, cert)
if err != nil || storedCertData.RenewalInfo == nil {
return false, acme.RenewalInfo{}, err
}
// prefer stored info if it has a window and the loaded one doesn't,
// or if the one in storage has a later RetryAfter (though I suppose
// it's not guaranteed, typically those will move forward in time)
if (!cert.ari.HasWindow() && storedCertData.RenewalInfo.HasWindow()) ||
storedCertData.RenewalInfo.RetryAfter.After(*cert.ari.RetryAfter) {
return true, *storedCertData.RenewalInfo, nil
}
return false, acme.RenewalInfo{}, nil
}
// loadStoredACMECertificateMetadata loads the stored ACME certificate data
// from the cert's sidecar JSON file.
func (cfg *Config) loadStoredACMECertificateMetadata(ctx context.Context, cert Certificate) (acme.Certificate, error) {
metaBytes, err := cfg.Storage.Load(ctx, StorageKeys.SiteMeta(cert.issuerKey, cert.Names[0]))
if err != nil {
return acme.Certificate{}, fmt.Errorf("loading cert metadata: %w", err)
}
var certRes CertificateResource
if err = json.Unmarshal(metaBytes, &certRes); err != nil {
return acme.Certificate{}, fmt.Errorf("unmarshaling cert metadata: %w", err)
}
var acmeCert acme.Certificate
if err = json.Unmarshal(certRes.IssuerData, &acmeCert); err != nil {
return acme.Certificate{}, fmt.Errorf("unmarshaling potential ACME issuer metadata: %v", err)
}
return acmeCert, nil
}
// updateARI updates the cert's ACME renewal info, first by checking storage for a newer
// one, or getting it from the CA if needed. The updated info is stored in storage and
// updated in the cache. The certificate with the updated ARI is returned. If true is
// returned, the ARI window or selected time has changed, and the caller should check if
// the cert needs to be renewed now, even if there is an error.
func (cfg *Config) updateARI(ctx context.Context, cert Certificate, logger *zap.Logger) (updatedCert Certificate, changed bool, err error) {
logger = logger.With(
zap.Strings("identifiers", cert.Names),
zap.String("cert_hash", cert.hash),
zap.String("ari_unique_id", cert.ari.UniqueIdentifier),
zap.Time("cert_expiry", cert.Leaf.NotAfter))
updatedCert = cert
oldARI := cert.ari
// see if the stored value has been refreshed already by another instance
gotNewARI, newARI, err := cfg.storageHasNewerARI(ctx, cert)
// when we're all done, log if something about the schedule is different
// ("WARN" level because ARI window changing may be a sign of external trouble
// and we want to draw their attention to a potential explanation URL)
defer func() {
changed = !newARI.SameWindow(oldARI)
if changed {
logger.Warn("ARI window or selected renewal time changed",
zap.Time("prev_start", oldARI.SuggestedWindow.Start),
zap.Time("next_start", newARI.SuggestedWindow.Start),
zap.Time("prev_end", oldARI.SuggestedWindow.End),
zap.Time("next_end", newARI.SuggestedWindow.End),
zap.Time("prev_selected_time", oldARI.SelectedTime),
zap.Time("next_selected_time", newARI.SelectedTime),
zap.String("explanation_url", newARI.ExplanationURL))
}
}()
if err == nil && gotNewARI {
// great, storage has a newer one we can use
cfg.certCache.mu.Lock()
updatedCert = cfg.certCache.cache[cert.hash]
updatedCert.ari = newARI
cfg.certCache.cache[cert.hash] = updatedCert
cfg.certCache.mu.Unlock()
logger.Info("reloaded ARI with newer one in storage",
zap.Timep("next_refresh", newARI.RetryAfter),
zap.Time("renewal_time", newARI.SelectedTime))
return
}
if err != nil {
logger.Error("error while checking storage for updated ARI; updating ARI now", zap.Error(err))
}
// of the issuers configured, hopefully one of them is the ACME CA we got the cert from
for _, iss := range cfg.Issuers {
if acmeIss, ok := iss.(*ACMEIssuer); ok {
newARI, err = acmeIss.getRenewalInfo(ctx, cert) // be sure to use existing newARI variable so we can compare against old value in the defer
if err != nil {
// could be anything, but a common error might simply be the "wrong" ACME CA
// (meaning, different from the one that issued the cert, thus the only one
// that would have any ARI for it) if multiple ACME CAs are configured
logger.Error("failed updating renewal info from ACME CA",
zap.String("issuer", iss.IssuerKey()),
zap.Error(err))
continue
}
// when we get the latest ARI, the acme package will select a time within the window
// for us; of course, since it's random, it's likely different from the previously-
// selected time; but if the window doesn't change, there's no need to change the
// selected time (the acme package doesn't know the previous window to know better)
// ... so if the window hasn't changed we'll just put back the selected time
if newARI.SameWindow(oldARI) && !oldARI.SelectedTime.IsZero() {
newARI.SelectedTime = oldARI.SelectedTime
}
// then store the updated ARI (even if the window didn't change, the Retry-After
// likely did) in cache and storage
// be sure we get the cert from the cache while inside a lock to avoid logical races
cfg.certCache.mu.Lock()
updatedCert = cfg.certCache.cache[cert.hash]
updatedCert.ari = newARI
cfg.certCache.cache[cert.hash] = updatedCert
cfg.certCache.mu.Unlock()
// update the ARI value in storage
var certData acme.Certificate
certData, err = cfg.loadStoredACMECertificateMetadata(ctx, cert)
if err != nil {
err = fmt.Errorf("got new ARI from %s, but failed loading stored certificate metadata: %v", iss.IssuerKey(), err)
return
}
certData.RenewalInfo = &newARI
var certDataBytes, certResBytes []byte
certDataBytes, err = json.Marshal(certData)
if err != nil {
err = fmt.Errorf("got new ARI from %s, but failed marshaling certificate ACME metadata: %v", iss.IssuerKey(), err)
return
}
certResBytes, err = json.MarshalIndent(CertificateResource{
SANs: cert.Names,
IssuerData: certDataBytes,
}, "", "\t")
if err != nil {
err = fmt.Errorf("got new ARI from %s, but could not re-encode certificate metadata: %v", iss.IssuerKey(), err)
return
}
if err = cfg.Storage.Store(ctx, StorageKeys.SiteMeta(cert.issuerKey, cert.Names[0]), certResBytes); err != nil {
err = fmt.Errorf("got new ARI from %s, but could not store it with certificate metadata: %v", iss.IssuerKey(), err)
return
}
logger.Info("updated ACME renewal information",
zap.Time("selected_time", newARI.SelectedTime),
zap.Timep("next_update", newARI.RetryAfter),
zap.String("explanation_url", newARI.ExplanationURL))
return
}
}
err = fmt.Errorf("could not fully update ACME renewal info: either no ACME issuer configured for certificate, or all failed (make sure the ACME CA that issued the certificate is configured)")
return
}
// CleanStorageOptions specifies how to clean up a storage unit.
type CleanStorageOptions struct {
// Optional custom logger.
@ -725,6 +915,19 @@ func certShouldBeForceRenewed(cert Certificate) bool {
cert.ocsp.Status == ocsp.Revoked
}
type certList []Certificate
// insert appends cert to the list if it is not already in the list.
// Efficiency: O(n)
func (certs *certList) insert(cert Certificate) {
for _, c := range *certs {
if c.hash == cert.hash {
return
}
}
*certs = append(*certs, cert)
}
const (
// DefaultRenewCheckInterval is how often to check certificates for expiration.
// Scans are very lightweight, so this can be semi-frequent. This default should

View File

@ -258,7 +258,11 @@ func (iss *ZeroSSLIssuer) Revoke(ctx context.Context, cert CertificateResource,
default:
return fmt.Errorf("unsupported reason: %d", reason)
}
return iss.getClient().RevokeCertificate(ctx, cert.IssuerData.(zerossl.CertificateObject).ID, r)
var certObj zerossl.CertificateObject
if err := json.Unmarshal(cert.IssuerData, &certObj); err != nil {
return err
}
return iss.getClient().RevokeCertificate(ctx, certObj.ID, r)
}
func (iss *ZeroSSLIssuer) getDistributedValidationInfo(ctx context.Context, identifier string) (acme.Challenge, bool, error) {