typesense/pkg/indexing/contentserver.go

package typesenseindexing

import (
	"context"
	"fmt"

	"slices"

	contentserverclient "github.com/foomo/contentserver/client"
	"github.com/foomo/contentserver/content"
	pkgx "github.com/foomo/typesense/pkg"
	"go.uber.org/zap"
)

// ContentserverDataAttributeNoIndex is the key looked up in a repo node's Data.
// A node whose value under this key is the boolean true is excluded from indexing.
const ContentserverDataAttributeNoIndex = "typesenseIndexing-noIndex"

// ContentServer provides index documents built from the nodes of a contentserver repo.
// The indexDocument type parameter is the document type produced by the registered
// document provider functions.
//
// Fields:
//   - l: logger used to report skipped nodes, missing providers and failed lookups.
//   - contentserverClient: client used to load the repo and to resolve URIs.
//   - documentProviderFuncs: document provider function per document type; the
//     document type of a node is its mime type.
//   - supportedMimeTypes: mime types a node must have in order to be indexed.
type ContentServer[indexDocument any] struct {
	l                     *zap.Logger
	contentserverClient   *contentserverclient.Client
	documentProviderFuncs map[pkgx.DocumentType]pkgx.DocumentProviderFunc[indexDocument]
	supportedMimeTypes    []string
}

// NewContentServer returns a ContentServer wired with the given logger, contentserver
// client, document provider functions (keyed by document type) and the list of mime
// types that are eligible for indexing. The arguments are stored as passed; no copies
// are made.
func NewContentServer[indexDocument any](
	l *zap.Logger,
	client *contentserverclient.Client,
	documentProviderFuncs map[pkgx.DocumentType]pkgx.DocumentProviderFunc[indexDocument],
	supportedMimeTypes []string,
) *ContentServer[indexDocument] {
	provider := new(ContentServer[indexDocument])
	provider.l = l
	provider.contentserverClient = client
	provider.documentProviderFuncs = documentProviderFuncs
	provider.supportedMimeTypes = supportedMimeTypes
	return provider
}

// Provide builds the index documents for the contentserver dimension named by indexID.
//
// It collects the indexable document infos of the dimension, resolves the URLs for
// their IDs and calls the document provider function registered for each document
// type. A document is left out of the result, never represented by a nil entry, when:
//   - no provider function is registered for its document type (logged as a warning),
//   - its provider function returns an error (logged as an error), or
//   - its provider function returns a nil document.
//
// An error is returned only when the document infos or the URLs cannot be fetched.
func (c ContentServer[indexDocument]) Provide(
	ctx context.Context,
	indexID pkgx.IndexID,
) ([]*indexDocument, error) {
	documentInfos, err := c.getDocumentIDsByIndexID(ctx, indexID)
	if err != nil {
		return nil, err
	}

	urlsByIDs, err := c.fetchURLsByDocumentIDs(ctx, indexID, documentInfos)
	if err != nil {
		return nil, err
	}

	// Start empty and append: a slice pre-filled to len(documentInfos) would keep a
	// nil hole for every skipped document.
	documents := make([]*indexDocument, 0, len(documentInfos))
	for _, documentInfo := range documentInfos {
		documentProvider, ok := c.documentProviderFuncs[documentInfo.DocumentType]
		if !ok {
			c.l.Warn(
				"no document provider available for document type",
				zap.String("documentType", string(documentInfo.DocumentType)),
			)
			continue
		}

		document, err := documentProvider(ctx, indexID, documentInfo.DocumentID, urlsByIDs)
		if err != nil {
			// A single failing document must not abort the whole run.
			c.l.Error(
				"index document not created",
				zap.Error(err),
				zap.String("documentID", string(documentInfo.DocumentID)),
				zap.String("documentType", string(documentInfo.DocumentType)),
			)
			continue
		}
		if document == nil {
			continue
		}
		documents = append(documents, document)
	}
	return documents, nil
}

// ProvidePaged is the paged counterpart of Provide. Paging is not implemented for the
// contentserver provider: the method does not contact the content server and always
// returns a nil slice, zero and a non-nil error, so callers can handle the missing
// feature instead of crashing on a panic.
func (c ContentServer[indexDocument]) ProvidePaged(
	ctx context.Context,
	indexID pkgx.IndexID,
	offset int,
) ([]*indexDocument, int, error) {
	return nil, 0, fmt.Errorf("paged document provisioning is not implemented for contentserver index %s", indexID)
}

// getDocumentIDsByIndexID lists the documents to index for the contentserver dimension
// named by indexID.
//
// It loads the repo from the content server, flattens the node tree of the dimension
// and returns one DocumentInfo per node accepted by includeNode, using the node's mime
// type as document type and the node's ID as document ID. Rejected nodes are logged at
// debug level. The nodes are collected in a map, so the order of the result is not
// stable between calls.
//
// An error is returned when the repo cannot be loaded or when it contains no dimension
// named indexID.
func (c ContentServer[indexDocument]) getDocumentIDsByIndexID(
	ctx context.Context,
	indexID pkgx.IndexID,
) ([]pkgx.DocumentInfo, error) {
	repo, err := c.contentserverClient.GetRepo(ctx)
	if err != nil {
		return nil, fmt.Errorf("getting contentserver repo: %w", err)
	}

	// The repo is keyed by dimension; indexID selects the dimension to index.
	rootRepoNode, ok := repo[string(indexID)]
	if !ok {
		return nil, fmt.Errorf("contentserver dimension %s not found", indexID)
	}

	nodeMap := createFlatRepoNodeMap(rootRepoNode, map[string]*content.RepoNode{})
	documentInfos := make([]pkgx.DocumentInfo, 0, len(nodeMap))
	for _, repoNode := range nodeMap {
		if !includeNode(c.supportedMimeTypes, repoNode) {
			c.l.Debug("skipping document indexing",
				zap.String("path", repoNode.URI),
				zap.String("mimeType", repoNode.MimeType),
				zap.Bool("hidden", repoNode.Hidden),
			)
			continue
		}

		documentInfos = append(documentInfos, pkgx.DocumentInfo{
			DocumentType: pkgx.DocumentType(repoNode.MimeType),
			DocumentID:   pkgx.DocumentID(repoNode.ID),
		})
	}

	return documentInfos, nil
}

// fetchURLsByDocumentIDs asks the content server for the URIs of the given documents
// within the dimension named by indexID and returns them keyed by DocumentID.
// When the lookup fails the error is logged and returned unchanged.
func (c ContentServer[indexDocument]) fetchURLsByDocumentIDs(
	ctx context.Context,
	indexID pkgx.IndexID,
	documentInfos []pkgx.DocumentInfo,
) (map[pkgx.DocumentID]string, error) {
	// The client expects plain string IDs.
	rawIDs := make([]string, 0, len(documentInfos))
	for _, info := range documentInfos {
		rawIDs = append(rawIDs, string(info.DocumentID))
	}

	dimension := string(indexID)
	urisByRawID, err := c.contentserverClient.GetURIs(ctx, dimension, rawIDs)
	if err != nil {
		c.l.Error("failed to get URIs", zap.Error(err))
		return nil, err
	}

	urlsByID := convertMapStringToDocumentID(urisByRawID)
	return urlsByID, nil
}

// convertMapStringToDocumentID returns a new map holding the same values as input,
// with every string key re-typed as a DocumentID. The input map is not modified; a
// nil or empty input yields an empty, non-nil map.
func convertMapStringToDocumentID(input map[string]string) map[pkgx.DocumentID]string {
	converted := make(map[pkgx.DocumentID]string, len(input))
	for rawID := range input {
		converted[pkgx.DocumentID(rawID)] = input[rawID]
	}
	return converted
}

// includeNode reports whether node takes part in indexing. A node is included only
// when it is non-nil, its Data does not carry the boolean true under
// ContentserverDataAttributeNoIndex, and its mime type is one of supportedMimeTypes.
func includeNode(supportedMimeTypes []string, node *content.RepoNode) bool {
	if node == nil {
		return false
	}
	// A missing or non-bool attribute makes the assertion yield false, i.e. "index it".
	excluded, _ := node.Data[ContentserverDataAttributeNoIndex].(bool)
	return !excluded && slices.Contains(supportedMimeTypes, node.MimeType)
}

// createFlatRepoNodeMap walks the tree rooted at node and registers node and all of
// its descendants in nodeMap under their IDs. The map passed in is filled in place and
// returned; a nil node leaves it untouched.
func createFlatRepoNodeMap(node *content.RepoNode, nodeMap map[string]*content.RepoNode) map[string]*content.RepoNode {
	if node != nil {
		// Register the node itself before descending into its children.
		nodeMap[node.ID] = node
		for _, childNode := range node.Nodes {
			createFlatRepoNodeMap(childNode, nodeMap)
		}
	}
	return nodeMap
}