Skip to content

Commit

Permalink
image cache with a bypass option
Browse files Browse the repository at this point in the history
  • Loading branch information
FranciscoMoretti committed Sep 5, 2024
1 parent 5dddef0 commit 86f4fc4
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 36 deletions.
23 changes: 14 additions & 9 deletions packages/download-notion/src/NotionImage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import {
import fs from "fs-extra"

import { NotionObject } from "./NotionObject"
import { FileData, ImageSet, readPrimaryImage } from "./imagesUtils"
import { FileData, ImageSet, readImage } from "./imagesUtils"
import { getImageUrl } from "./notion_objects_utils"

export type PageObjectResponseWithCover = PageObjectResponse & {
Expand Down Expand Up @@ -45,7 +45,7 @@ export class NotionImage implements NotionObject {
private parseImageBlock(imageBlock: ImageBlockObjectResponse): ImageSet {
const imageObject = imageBlock.image
return {
primaryUrl: getImageUrl(imageObject),
url: getImageUrl(imageObject),
caption:
imageBlock.image.caption?.map((c) => c.plain_text).join("") || "",
}
Expand All @@ -54,21 +54,26 @@ export class NotionImage implements NotionObject {
page: PageObjectResponseWithCover | DatabaseObjectResponseWithCover
): ImageSet {
const primaryUrl = getImageUrl(page.cover)
return { primaryUrl, caption: "" }
return { url: primaryUrl, caption: "" }
}

async read() {
/**
 * Returns this image's file data, loading it from the image URL on first use.
 *
 * If `this.fileData` is already populated it is returned as-is; otherwise the
 * data is read from `this.imageSet.url` via `readAndSetFileData` using the
 * "url" source type.
 */
async download() {
  const alreadyLoaded = this.fileData
  if (alreadyLoaded) {
    return alreadyLoaded
  }

  const { url } = this.imageSet
  return await this.readAndSetFileData(url, "url")
}
/**
 * Loads this image's file data from `path` via `readAndSetFileData` using the
 * "file" source type, and returns the resulting file data.
 */
async readFromFile(path: string) {
  const loadedFileData = await this.readAndSetFileData(path, "file")
  return loadedFileData
}

const { primaryBuffer, fileType } = await readPrimaryImage(
this.imageSet.primaryUrl
)
// TODO: Consider extracting to util
private async readAndSetFileData(source: string, type: "file" | "url") {
const { buffer, fileType } = await readImage(source, type)
this.fileData = {
extension: fileType.ext,
mime: fileType.mime,
buffer: primaryBuffer,
buffer,
}
return this.fileData
}
Expand All @@ -85,7 +90,7 @@ export class NotionImage implements NotionObject {
}

get url(): string {
return this.imageSet.primaryUrl
return this.imageSet.url
}

get caption(): string | undefined {
Expand Down
2 changes: 1 addition & 1 deletion packages/download-notion/src/fetchImages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ async function fetchImageAndSaveToCache(
outputDir: string,
imagesCacheFilesMap: FilesMap
) {
const imageData = await image.read()
const imageData = await image.download()
const imagePath = path.join(outputDir, `${image.id}.${imageData.extension}`)
await image.save(imagePath)
imagesCacheFilesMap.set("image", image.id, {
Expand Down
41 changes: 23 additions & 18 deletions packages/download-notion/src/imagesUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ export type OutputPaths = {
}

export type ImageSet = {
primaryUrl: string
url: string
caption?: string
}

Expand All @@ -38,29 +38,34 @@ export function updateImageUrlToMarkdownImagePath(
}
}

export async function readPrimaryImage(url: string) {
export async function readImage(source: string, type: "file" | "url") {
try {
// Keep alive with a long timeout solved some image retrieval issues. Maybe we should consider retries with exponential
// back-offs if this becomes an issue again.
const response = await axios.get(url, {
responseType: "arraybuffer",
httpsAgent: new https.Agent({ keepAlive: true }),
timeout: 10000,
})
const primaryBuffer = Buffer.from(response.data, "utf-8")
const fileType = await FileType.fromBuffer(primaryBuffer)
const buffer = await readBuffer(source, type)
const fileType = await FileType.fromBuffer(buffer)

if (!fileType) {
throw new Error(`Failed to determine file type for image at ${url}`)
throw new Error(`Failed to determine file type for image at ${source}`)
}

return {
primaryBuffer,
fileType,
}
return { buffer, fileType }
} catch (error) {
console.error(`Error fetching image from ${url}:`, error)
throw error // Re-throw the error if you want calling functions to handle it
console.error(`Error reading image from ${source}:`, error)
throw error
}
}

/**
 * Reads the raw bytes of `source` into a Buffer.
 *
 * @param source - Path handed to `fs.readFile` when `type` is "file", or the
 *   address handed to `axios.get` when `type` is "url".
 * @param type - Selects how `source` is read.
 * @returns A Buffer with the file contents or the response body.
 * @throws Error with message `Invalid type <type>` for any other `type` value.
 */
async function readBuffer(source: string, type: "file" | "url") {
  if (type === "file") {
    return await fs.readFile(source)
  }

  if (type !== "url") {
    throw new Error(`Invalid type ${type}`)
  }

  // A new keep-alive agent is created for every request.
  const keepAliveAgent = new https.Agent({ keepAlive: true })
  const { data } = await axios.get(source, {
    responseType: "arraybuffer",
    httpsAgent: keepAliveAgent,
    timeout: 10000,
  })
  return Buffer.from(data)
}

Expand Down
15 changes: 9 additions & 6 deletions packages/download-notion/src/notionPull.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import { NotionToMarkdown } from "notion-to-md"

import { FilesCleaner } from "./FilesCleaner"
import { FilesManager, ObjectPrefixDict } from "./FilesManager"
import { FileType } from "./FilesMap"
import { FileType, FilesMap } from "./FilesMap"
import { FlatLayoutStrategy } from "./FlatLayoutStrategy"
import { HierarchicalLayoutStrategy } from "./HierarchicalLayoutStrategy"
import { ImageNamingStrategy } from "./ImageNamingStrategy"
Expand All @@ -38,7 +38,7 @@ import {
import { removePathExtension } from "./pathUtils"
import { convertInternalUrl } from "./plugins/internalLinks"
import { IDocuNotionContext } from "./plugins/pluginTypes"
import { applyToAllImages, downloadAndUpdateMetadata } from "./processImages"
import { applyToAllImages, readAndUpdateMetadata } from "./processImages"
import { getMarkdownForPage } from "./transform"
import {
convertToUUID,
Expand Down Expand Up @@ -219,10 +219,12 @@ export async function notionPull(options: NotionPullOptions): Promise<void> {

const objectsTree = new NotionObjectTree(objectsTreeRootNode, objectsData)

const imagesCacheDir = cacheDir + "images/"
const imagesCacheFilesMap = await fetchImages(objectsTree, imagesCacheDir)
let imagesCacheFilesMap: FilesMap | undefined = undefined
if (options.cache.cacheImages) {
const imagesCacheDir = cacheDir + "images/"
imagesCacheFilesMap = await fetchImages(objectsTree, imagesCacheDir)
}

console.log(imagesCacheFilesMap)
info("PULL: Notion Download Completed")
if (options.conversion.skip) {
return
Expand Down Expand Up @@ -274,11 +276,12 @@ export async function notionPull(options: NotionPullOptions): Promise<void> {
await applyToAllImages({
objectsTree,
applyToImage: async (image) => {
await downloadAndUpdateMetadata({
await readAndUpdateMetadata({
image,
existingFilesManager,
newFilesManager,
imageNamingStrategy,
imagesCacheFilesMap,
})
},
})
Expand Down
12 changes: 10 additions & 2 deletions packages/download-notion/src/processImages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import {
import { NotionObjectTree } from "notion-downloader"

import { FilesManager, copyRecord } from "./FilesManager"
import { FilesMap } from "./FilesMap"
import { ImageNamingStrategy } from "./ImageNamingStrategy"
import { NotionDatabase } from "./NotionDatabase"
import {
Expand All @@ -16,19 +17,26 @@ import {
import { NotionPage } from "./NotionPage"
import { updateImageUrlToMarkdownImagePath } from "./imagesUtils"

export async function downloadAndUpdateMetadata({
export async function readAndUpdateMetadata({
image,
existingFilesManager,
newFilesManager,
imageNamingStrategy,
imagesCacheFilesMap,
}: {
image: NotionImage
existingFilesManager: FilesManager
newFilesManager: FilesManager
imageNamingStrategy: ImageNamingStrategy
imagesCacheFilesMap: FilesMap | undefined
}) {
if (existingFilesManager.isObjectNew(image)) {
await image.read()
if (imagesCacheFilesMap) {
const cachedImage = imagesCacheFilesMap.get("image", image.id)
await image.readFromFile(cachedImage.path)
} else {
await image.download()
}
// TODO: Write here a layout naming strategy for images. Name is ok, but path is not.
const imageFilename = imageNamingStrategy.getFileName(image)
newFilesManager.set("base", "image", image.id, {
Expand Down
1 change: 1 addition & 0 deletions packages/notion-downloader/src/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,6 @@ export const cacheOptionsSchema = z
cacheStrategy: z
.enum(["cache", "no-cache", "force-cache"])
.default("cache"),
cacheImages: z.boolean().default(true),
})
.default({})

0 comments on commit 86f4fc4

Please sign in to comment.