open-design/scripts/import-prompt-templates.mjs

#!/usr/bin/env node
/**
 * Pulls down the upstream prompt corpora (CC BY 4.0) and emits curated
 * JSON files under `prompt-templates/{image,video}/`. Re-run anytime to
 * pick up new featured prompts.
 *
 * Usage:
 *   node scripts/import-prompt-templates.mjs
 *
 * Source READMEs:
 *   - https://github.com/YouMind-OpenLab/awesome-gpt-image-2 (CC BY 4.0)
 *   - https://github.com/YouMind-OpenLab/awesome-seedance-2-prompts (CC BY 4.0)
 *
 * Each upstream README is a structured catalog. Two patterns we care about:
 *
 *   Featured block:
 *     ### No. N: <Title>
 *     <badges>
 *     #### 📖 Description
 *     <description paragraph>
 *     #### 📝 Prompt
 *     ```
 *     <prompt body>
 *     ```
 *     #### 🎬 Video  (or 🖼️ Generated Images)
 *     <preview img / video link>
 *     #### 📌 Details
 *     - **Author:** [Name](url)
 *     - **Source:** [Twitter Post](url)
 *     - **Published:** ...
 *
 *   All-Prompts block:
 *     ### <Title>
 *     <badges>
 *     > <description>
 *     #### 📝 Prompt
 *     ```
 *     <prompt body>
 *     ```
 *     <img src="<thumb>"> | <a href=...>
 *     **Author:** [Name](url) | **Source:** [Link](url) | **Published:** ...
 *
 * We take every entry in each repo's featured block (six at the time of
 * writing, always good) plus the first `sampleAllPrompts` usable entries from
 * the head of the All-Prompts section so the gallery has breadth across
 * categories.
 *
 * All output JSON carries a `source` block so attribution stays intact.
 */

import { mkdir, writeFile, readdir, unlink, readFile } from 'node:fs/promises';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

// Directory containing this script. ES modules have no built-in __dirname,
// so it is derived from import.meta.url.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// One level above the script's directory; the output folders hang off it.
const ROOT = path.resolve(__dirname, '..');
// Output directories for the generated JSON, one per surface.
const OUT_IMAGE = path.join(ROOT, 'prompt-templates', 'image');
const OUT_VIDEO = path.join(ROOT, 'prompt-templates', 'video');

// Upstream corpora to import, one object per README. Each object is passed
// around as `ctx`:
//   - surface:          'image' or 'video'; selects the output directory and
//                       the keyword tables used for category/tag inference.
//   - repo / license:   copied into every entry's `source` block.
//   - readmeUrl:        raw README that gets fetched and parsed.
//   - defaultModel / defaultAspect: written to every entry as `model` /
//                       `aspect`.
//   - sampleAllPrompts: maximum number of "All Prompts" entries to keep.
const SOURCES = [
  {
    surface: 'image',
    repo: 'YouMind-OpenLab/awesome-gpt-image-2',
    license: 'CC-BY-4.0',
    readmeUrl:
      'https://raw.githubusercontent.com/YouMind-OpenLab/awesome-gpt-image-2/main/README.md',
    defaultModel: 'gpt-image-2',
    defaultAspect: '1:1',
    // Cap how many entries we pull from the "All Prompts" section (taken in
    // README order, starting at the top of that section) to keep the
    // committed dataset reviewable. The featured block is always taken.
    sampleAllPrompts: 30,
  },
  {
    surface: 'video',
    repo: 'YouMind-OpenLab/awesome-seedance-2-prompts',
    license: 'CC-BY-4.0',
    readmeUrl:
      'https://raw.githubusercontent.com/YouMind-OpenLab/awesome-seedance-2-prompts/main/README.md',
    defaultModel: 'seedance-2.0',
    defaultAspect: '16:9',
    sampleAllPrompts: 30,
  },
];

/**
 * Download `url` and resolve with the response body as text.
 *
 * @param {string} url - Address to request with the global `fetch`.
 * @returns {Promise<string>} Body text of a successful response.
 * @throws {Error} `failed <url>: <status>` when the response is not ok.
 */
async function fetchText(url) {
  const response = await fetch(url);
  if (response.ok) return response.text();
  throw new Error(`failed ${url}: ${response.status}`);
}

/**
 * Build a filesystem- and URL-safe id from a free-form title.
 *
 * Lower-cases the input, strips combining diacritics (after NFKD
 * decomposition), collapses every run of characters outside `a-z0-9` into a
 * single hyphen, and caps the result at 64 characters.
 *
 * @param {string} input - Title to convert.
 * @returns {string} Slug of at most 64 characters with no leading or
 *   trailing hyphen. Empty when the input has no ASCII letters or digits.
 */
function slugify(input) {
  return (
    input
      .toLowerCase()
      .normalize('NFKD')
      .replace(/[\u0300-\u036f]/g, '')
      .replace(/[^a-z0-9]+/g, '-')
      .replace(/^-+/, '')
      .slice(0, 64)
      // Trim trailing hyphens AFTER truncating: the 64-char cut can land
      // right on a separator and would otherwise leave a dangling "-".
      .replace(/-+$/, '')
  );
}

/**
 * Return the text of one README section: everything after the first match of
 * `headerRe` up to (not including) the next line that starts with "## ".
 *
 * @param {string} md - Full markdown document.
 * @param {RegExp} headerRe - Pattern locating the section header.
 * @returns {string} Section text, running to the end of the document when no
 *   later H2 exists; '' when the header is not found.
 */
function sliceSection(md, headerRe) {
  const hit = headerRe.exec(md);
  if (hit === null) return '';
  const rest = md.slice(hit.index + hit[0].length);
  const nextH2 = rest.search(/\n## /);
  return nextH2 === -1 ? rest : rest.slice(0, nextH2);
}

/**
 * Parse the featured section of a README into template entries.
 *
 * Every featured prompt starts at a "### No. N: Title" line; the text up to
 * the next such line (or the end of the block) is handed to parseEntryBody
 * together with the captured title. Entries parseEntryBody rejects are
 * dropped.
 *
 * @param {string} block - Markdown of the featured section.
 * @param {object} ctx - Source descriptor forwarded to parseEntryBody.
 * @returns {object[]} Parsed entries in document order.
 */
function parseFeaturedBlock(block, ctx) {
  const marks = Array.from(
    block.matchAll(/^### No\. \d+: (.+?)\s*$/gm),
    (hit) => ({
      start: hit.index,
      bodyStart: hit.index + hit[0].length,
      title: hit[1],
    }),
  );
  return marks
    .map((mark, i) => {
      const bodyEnd = i + 1 < marks.length ? marks[i + 1].start : block.length;
      return parseEntryBody(block.slice(mark.bodyStart, bodyEnd), mark.title, ctx, true);
    })
    .filter(Boolean);
}

/**
 * Parse the "All Prompts" section, keeping at most `ctx.sampleAllPrompts`
 * entries.
 *
 * Entries start at "### <Title>" lines. Some READMEs prefix the title with
 * "No. N:" and some do not; the prefix is stripped here so both shapes reach
 * parseEntryBody with a bare title. Headers are visited in document order and
 * scanning stops as soon as enough entries have been accepted; entries that
 * parseEntryBody rejects do not count towards the cap.
 *
 * @param {string} block - Markdown of the "All Prompts" section.
 * @param {object} ctx - Source descriptor; `sampleAllPrompts` is the cap.
 * @returns {object[]} Up to `ctx.sampleAllPrompts` parsed entries.
 */
function parseAllPromptsBlock(block, ctx) {
  const marks = [];
  for (const hit of block.matchAll(/^### (.+?)\s*$/gm)) {
    marks.push({
      start: hit.index,
      bodyStart: hit.index + hit[0].length,
      title: hit[1].replace(/^No\.\s*\d+:\s*/, '').trim(),
    });
  }

  const picked = [];
  for (const [i, mark] of marks.entries()) {
    // Written as a negated "<" so a missing cap still yields no entries.
    if (!(picked.length < ctx.sampleAllPrompts)) break;
    const bodyEnd = i + 1 < marks.length ? marks[i + 1].start : block.length;
    const entry = parseEntryBody(block.slice(mark.bodyStart, bodyEnd), mark.title, ctx, false);
    if (entry) picked.push(entry);
  }
  return picked;
}

/**
 * Turn the markdown between two "###" headers into a template record.
 *
 * @param {string} body - Markdown following the entry's header line.
 * @param {string} title - Header title (any "No. N:" prefix already removed).
 * @param {object} ctx - Source descriptor; reads `surface`, `defaultModel`,
 *   `defaultAspect`, `repo` and `license`.
 * @param {boolean} featured - Whether the entry came from the featured
 *   section. Currently not read; kept so both call sites share a signature.
 * @returns {object|null} The entry, or null when the body has no fenced
 *   prompt under "#### 📝 Prompt", the prompt is shorter than 40 characters,
 *   or the title cannot produce a non-empty id.
 */
function parseEntryBody(body, title, ctx, featured) {
  const promptMatch = /#### 📝 Prompt\s*\n+```[a-zA-Z0-9_-]*\n([\s\S]*?)```/m.exec(
    body,
  );
  if (!promptMatch) return null;
  const prompt = promptMatch[1].trim();
  if (prompt.length < 40) return null;

  // The id doubles as the output file name. A title without any ASCII
  // letters or digits slugifies to '' and would be written as ".json", with
  // every such entry colliding on the same name, so skip it instead.
  const id = slugify(title);
  if (!id) return null;

  // The image README structures every entry — featured AND in-list —
  // with a "#### 📖 Description" block. The seedance README only does
  // that for featured; in-list entries fall back to a leading blockquote.
  // Try the structured form first regardless, then fall back.
  const description =
    extractDescription(body) || extractBlockquoteSummary(body);
  const author = extractAuthor(body);
  const sourceUrl = extractSourceUrl(body);
  const previewImage = extractFirstImage(body);
  const previewVideo = extractVideoLink(body);
  const category = inferCategory(title, ctx.surface);
  const tags = inferTags(title, prompt, ctx.surface);

  // Missing optional fields are set to undefined (not null) so that
  // JSON.stringify omits them from the written file.
  return {
    id,
    surface: ctx.surface,
    title: cleanTitle(title),
    summary: (description || cleanTitle(title)).slice(0, 200),
    category,
    tags,
    model: ctx.defaultModel,
    aspect: ctx.defaultAspect,
    prompt,
    previewImageUrl: previewImage ?? undefined,
    previewVideoUrl: previewVideo ?? undefined,
    source: {
      repo: ctx.repo,
      license: ctx.license,
      author: author ?? undefined,
      url: sourceUrl ?? undefined,
    },
  };
}

/**
 * Pull the paragraph under "#### 📖 Description" out of an entry body.
 *
 * The paragraph ends at the next "####" heading or "---" rule; when neither
 * follows, nothing is matched.
 *
 * @param {string} body - Entry markdown.
 * @returns {string} Description with whitespace runs collapsed to single
 *   spaces, or '' when no description block is found.
 */
function extractDescription(body) {
  const hit = /#### 📖 Description\s*\n+([\s\S]*?)(?=\n+####|\n+---)/m.exec(body);
  if (!hit) return '';
  return hit[1].trim().replace(/\s+/g, ' ');
}

/**
 * Return the text of the first markdown blockquote line ("> ...") in `body`.
 *
 * @param {string} body - Entry markdown.
 * @returns {string} Trimmed quote text, or '' when no line starts with ">".
 */
function extractBlockquoteSummary(body) {
  const hit = body.match(/^>\s*(.+?)\s*$/m);
  return hit ? hit[1].trim() : '';
}

/**
 * Read the author's display name from a "**Author:** [Name](url)" marker.
 *
 * The same pattern covers the featured layout ("- **Author:** [Name](url)")
 * and the all-prompts layout ("**Author:** [Name](url) | ...").
 *
 * @param {string} body - Entry markdown.
 * @returns {string|null} Trimmed link text, or null when no marker is found.
 */
function extractAuthor(body) {
  const hit = body.match(/\*\*Author:\*\*\s*\[([^\]]+)\]/);
  if (hit === null) return null;
  return hit[1].trim();
}

/**
 * Read the link target from a "**Source:** [label](url)" marker.
 *
 * @param {string} body - Entry markdown.
 * @returns {string|null} Trimmed URL (text up to the first ")"), or null
 *   when no marker is found.
 */
function extractSourceUrl(body) {
  const hit = body.match(/\*\*Source:\*\*\s*\[[^\]]+\]\(([^)]+)\)/);
  if (hit === null) return null;
  return hit[1].trim();
}

/**
 * Return the `src` of the first HTML `<img>` tag in `body`.
 *
 * @param {string} body - Entry markdown.
 * @returns {string|null} The attribute value, or null when there is no
 *   `<img ... src="...">` tag.
 */
function extractFirstImage(body) {
  const hit = body.match(/<img[^>]*src=["']([^"']+)["']/);
  return hit ? hit[1] : null;
}

/**
 * Find a playable video URL for an entry.
 *
 * Two sources are tried in order:
 *   1. An explicit `href="....mp4"` link. Featured entries embed a GitHub
 *      release download link of this shape; it is preferred because it is a
 *      static URL that needs no per-request signed redirect.
 *   2. A Cloudflare Stream thumbnail URL. All-prompts entries only embed the
 *      thumbnail, so the mp4 address is rebuilt from the host and video id
 *      in that URL, pointing at `/downloads/default.mp4`. Per the original
 *      author's notes that endpoint redirects to a freshly signed CDN URL on
 *      each request and works as a `<video src>`, seeking included.
 *
 * @param {string} body - Entry markdown.
 * @returns {string|null} Video URL, or null when neither pattern is present.
 */
function extractVideoLink(body) {
  const direct = body.match(/href=["']([^"']+\.mp4)["']/);
  if (direct !== null) return direct[1];

  const thumb = body.match(
    /https?:\/\/([a-z0-9-]+\.cloudflarestream\.com)\/([a-f0-9]{20,})\/thumbnails\/thumbnail\.jpg/i,
  );
  if (thumb === null) return null;

  const [, host, videoId] = thumb;
  return `https://${host}/${videoId}/downloads/default.mp4`;
}

/**
 * Tidy a raw header title for display on cards.
 *
 * Removes a trailing parenthetical (everything from the first "(" on the
 * line when the title ends with ")"), then one leading "-" or "–" together
 * with the whitespace around it, then trims.
 *
 * NOTE(review): an earlier comment described stripping the shared category
 * prefix ("Profile / Avatar - ..."); the code only does the steps above, so
 * such a prefix is left in place — confirm which behaviour is intended.
 *
 * @param {string} raw - Title as captured from the README header.
 * @returns {string} Cleaned title.
 */
function cleanTitle(raw) {
  const withoutParenthetical = raw.replace(/\s*\(.*\)\s*$/, '');
  const withoutLeadingDash = withoutParenthetical.replace(/^\s*[-–]\s*/, '');
  return withoutLeadingDash.trim();
}

// Keyword → category tables, checked top to bottom; the first match wins.
// Short keywords carry word boundaries so they cannot fire inside unrelated
// words ("post" in "poster", "ui" in "fruit", "app" in "happy", "oil" in
// "foil", "nature" in "signature", "ad" in "shadow", "race" in "embrace",
// "action" in "interaction").
const IMAGE_CATEGORY_RULES = [
  [/profile|avatar|portrait/, 'Profile / Avatar'],
  [/social|\bposts?\b|carousel/, 'Social Media Post'],
  [/info[ -]?graphic|chart|diagram/, 'Infographic'],
  [/youtube|thumbnail/, 'YouTube Thumbnail'],
  [/comic|storyboard|panel/, 'Comic / Storyboard'],
  [/poster|flyer/, 'Poster / Flyer'],
  [/\bui\b|\bapps?\b|web design|mockup|landing/, 'App / Web Design'],
  [/product|exploded|merch|packaging/, 'Product Marketing'],
  [/anime|manga/, 'Anime / Manga'],
  [/cinematic|film/, 'Cinematic'],
  [/3d|render|isometric/, '3D Render'],
  [/sketch|line art|pencil/, 'Sketch / Line Art'],
  [/pixel/, 'Pixel Art'],
  [/\boil|water[- ]?color/, 'Painterly'],
  [/cyberpunk|sci[- ]?fi|futuristic/, 'Cyberpunk / Sci-Fi'],
  [/landscape|\bnature/, 'Landscape'],
];

const VIDEO_CATEGORY_RULES = [
  [/cinematic|film|movie|noir/, 'Cinematic'],
  [/anime|manga/, 'Anime'],
  [/\bads?\b|advert|commercial|brand/, 'Advertising'],
  [/ugc|tutorial|vlog/, 'UGC / Vlog'],
  [/meme|tiktok|viral/, 'Social / Meme'],
  // "short film" can never win here: "film" already matched 'Cinematic'.
  [/drama|short film|romance/, 'Short Film / Drama'],
  [/intro|motion graphics|title sequence/, 'Motion Graphics'],
  [/vfx|fantasy|magic/, 'VFX / Fantasy'],
  [/\brace|\baction|combat|fight/, 'Action'],
];

/**
 * Guess a gallery category from an entry's title.
 *
 * @param {string} title - Raw entry title; matched case-insensitively.
 * @param {string} surface - 'image' selects the image table; any other
 *   value selects the video table.
 * @returns {string} Category label of the first matching rule, or the
 *   fallback ('Illustration' for images, 'General' for video).
 */
function inferCategory(title, surface) {
  const lower = title.toLowerCase();
  const isImage = surface === 'image';
  const rules = isImage ? IMAGE_CATEGORY_RULES : VIDEO_CATEGORY_RULES;
  const hit = rules.find(([pattern]) => pattern.test(lower));
  if (hit) return hit[1];
  return isImage ? 'Illustration' : 'General';
}

/**
 * Derive a few search tags from an entry's title and prompt text.
 *
 * Each rule is tested against the lower-cased "title prompt" string; tags
 * come out in table order and are capped at 4 for images, 3 otherwise.
 *
 * Short keywords are anchored with word boundaries. Without them "mage"
 * fires on "image" and "elf" on "self"/"shelf"/"herself", which tagged most
 * image prompts as fantasy; likewise "chase"/"action"/"race"/"river" fired on
 * "purchase"/"interaction"/"embrace"/"driver".
 *
 * @param {string} title - Entry title.
 * @param {string} prompt - Prompt body.
 * @param {string} surface - 'image' allows 4 tags; anything else allows 3.
 * @returns {string[]} Matching tags, at most 4 (image) or 3 (other).
 */
function inferTags(title, prompt, surface) {
  const blob = `${title} ${prompt}`.toLowerCase();
  const checks = [
    ['portrait', /portrait|selfie|headshot/],
    ['anime', /anime|manga/],
    ['cinematic', /cinematic|filmic|grain|8k/],
    ['cyberpunk', /cyberpunk|neon/],
    ['fantasy', /fantasy|\bmages?\b|\belf\b|dragon/],
    ['3d-render', /3d render|unreal engine|render/],
    ['isometric', /isometric/],
    ['typography', /typography|kerning|font|lettering/],
    ['product', /product|packaging|exploded/],
    ['ugc', /ugc|vlog|selfie cam/],
    ['cinematic-romance', /romance|pure love|romantic/],
    ['action', /\bchase|\baction|combat|\brace/],
    ['food', /food|coffee|kitchen/],
    ['nature', /forest|\briver|mountain|landscape/],
  ];
  const limit = surface === 'image' ? 4 : 3;
  // Tag names in the table are unique, so no de-duplication is needed.
  return checks
    .filter(([, pattern]) => pattern.test(blob))
    .map(([tag]) => tag)
    .slice(0, limit);
}

/**
 * Remove previously generated JSON files from `dir`.
 *
 * A `.json` file is kept only when it parses and carries a `source.repo`
 * that differs from `upstreamRepo`; that is how hand-authored templates are
 * told apart from this importer's output, so first-party prompts are not
 * wiped on re-run. Files that cannot be read or parsed, and files without a
 * `source.repo`, are treated as generated and removed. Non-JSON files are
 * left alone.
 *
 * @param {string} dir - Directory to clean.
 * @param {string} upstreamRepo - Repo id whose generated files are removed.
 * @returns {Promise<void>}
 * @throws Any error from listing the directory other than "does not exist",
 *   and any error from deleting a file. These used to be swallowed, which
 *   could leave stale generated files behind without any signal.
 */
async function clearDir(dir, upstreamRepo) {
  let files;
  try {
    files = await readdir(dir);
  } catch (err) {
    // A missing directory simply has nothing to clear.
    if (err?.code === 'ENOENT') return;
    throw err;
  }

  for (const f of files) {
    if (!f.endsWith('.json')) continue;
    const filePath = path.join(dir, f);
    let keep = false;
    try {
      const parsed = JSON.parse(await readFile(filePath, 'utf8'));
      const repo = parsed?.source?.repo;
      keep = Boolean(repo) && repo !== upstreamRepo;
    } catch {
      // Unreadable or unparseable file — treat as generated and remove.
    }
    if (!keep) await unlink(filePath);
  }
}

/**
 * Write one pretty-printed JSON file per entry into `outDir`.
 *
 * The directory is created if needed and previously generated files are
 * cleared first (see clearDir). Ids are then de-duplicated: a `.json` file
 * still on disk after clearing (a preserved hand-authored template) blocks
 * an entry with the same id, and among the entries the first one wins —
 * callers pass featured entries ahead of the sampled ones.
 *
 * @param {object[]} entries - Parsed entries; each needs an `id`.
 * @param {string} outDir - Destination directory.
 * @param {string} upstreamRepo - Repo id forwarded to clearDir.
 * @returns {Promise<number>} Number of files written.
 */
async function writeAll(entries, outDir, upstreamRepo) {
  await mkdir(outDir, { recursive: true });
  await clearDir(outDir, upstreamRepo);

  // Ids already taken by files that survived the clear.
  const taken = new Set();
  try {
    for (const name of await readdir(outDir)) {
      if (name.endsWith('.json')) taken.add(name.slice(0, -'.json'.length));
    }
  } catch {
    // noop
  }

  let written = 0;
  for (const entry of entries) {
    const { id } = entry;
    if (!taken.has(id)) {
      taken.add(id);
      const payload = `${JSON.stringify(entry, null, 2)}\n`;
      await writeFile(path.join(outDir, `${id}.json`), payload, 'utf8');
      written += 1;
    }
  }
  return written;
}

/**
 * Import every configured source: fetch its README, parse the featured and
 * "All Prompts" sections, and write the resulting entries to the surface's
 * output directory.
 *
 * A source that yields no entries is reported on stderr, sets
 * `process.exitCode` to 1 and is skipped; the remaining sources still run.
 * A per-source summary and a final total are printed to stdout.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const totals = { image: 0, video: 0 };

  for (const ctx of SOURCES) {
    const md = await fetchText(ctx.readmeUrl);

    // Try the known header spellings in turn; an empty slice falls through.
    const featuredBlock =
      sliceSection(md, /## 🔥 Featured Prompts/m) ||
      sliceSection(md, /## ⭐ Featured Prompts/m) ||
      sliceSection(md, /## Featured/m);
    const allPromptsBlock =
      sliceSection(md, /## (📋|🎬) All Prompts/m) ||
      sliceSection(md, /## All Prompts/m);

    const featured = parseFeaturedBlock(featuredBlock, ctx);
    const sampled = parseAllPromptsBlock(allPromptsBlock, ctx);
    // Featured entries go first so they win id collisions when written.
    const entries = featured.concat(sampled);

    if (entries.length === 0) {
      console.error(`No entries parsed for ${ctx.repo}; check headers.`);
      process.exitCode = 1;
      continue;
    }

    // Anything that is not 'image' is counted and stored as video.
    const bucket = ctx.surface === 'image' ? 'image' : 'video';
    const outDir = bucket === 'image' ? OUT_IMAGE : OUT_VIDEO;
    const written = await writeAll(entries, outDir, ctx.repo);
    totals[bucket] += written;

    console.log(
      `[${ctx.repo}] featured=${featured.length} sampled=${sampled.length} written=${written} → ${path.relative(ROOT, outDir)}`,
    );
  }

  console.log(`\nDone. ${totals.image} image + ${totals.video} video templates.`);
}

// Script entry point: run the import; on any failure print the error and
// exit with status 1.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});