Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions packages/migrate/test/fixtures/substack-e2e-scrape.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">

<meta name="author" content="Test Author" />
<meta property="og:url" content="https://example.substack.com/p/plain-text" />
<title>Plain Text - Test Author</title>
<meta property="og:type" content="article"><meta property="og:title" content="Plain Text OG Title"><meta name="twitter:title" content="Plain Text Twitter Title"><meta name="description" content="A test meta description"><meta property="og:description" content="OG description for the post"><meta name="twitter:description" content="Twitter description for the post"><meta property="og:image" content="https://substack-post-media.s3.amazonaws.com/public/images/og-image.jpg"><meta name="twitter:image" content="https://substack-post-media.s3.amazonaws.com/public/images/og-image.jpg"><meta name="twitter:card" content="summary_large_image">
<link rel="canonical" href="https://example.substack.com/p/plain-text" />

<script type="application/ld+json">
{
"@context":"https://schema.org",
"@type":"NewsArticle",
"url":"https://example.substack.com/p/plain-text",
"mainEntityOfPage":"https://example.substack.com/p/plain-text",
"headline":"Plain Text",
"description":"A test meta description",
"image": [
{
"@type":"ImageObject",
"url":"https://substack-post-media.s3.amazonaws.com/public/images/og-image.jpg"
}
],
"datePublished":"2024-01-15T10:00:00+00:00",
"dateModified":"2024-01-15T10:00:00+00:00",
"isAccessibleForFree":true,
"author": {
"@type": "Person",
"name": "Test Author",
"url": "https://substack.com/@testauthor",
"description": "A test author"
},
"publisher": {
"@type": "Organization",
"name": "Test Author",
"url": "https://example.substack.com",
"description": "A Test Substack"
}
}
</script>
</head>
<body>

<p>Body content, not testing this</p>

<script>window._preloads = JSON.parse("{\"post\":{\"audience\":\"everyone\",\"canonical_url\":\"https://example.substack.com/p/plain-text\",\"podcast_art_url\":null,\"podcast_duration\":null,\"podcast_preview_upload_id\":null,\"podcast_upload_id\":null,\"podcast_url\":\"\",\"post_date\":\"2024-01-15T10:00:00.000Z\",\"search_engine_description\":\"A test meta description\",\"search_engine_title\":\"Plain Text\",\"slug\":\"plain-text\",\"social_title\":\"Plain Text OG Title\",\"subtitle\":\"A test subtitle\",\"title\":\"Plain Text\",\"type\":\"newsletter\",\"cover_image\":\"https://substack-post-media.s3.amazonaws.com/public/images/og-image.jpg\",\"postTags\":[{\"id\":\"tag-1\",\"publication_id\":100,\"name\":\"Testing\",\"slug\":\"testing\",\"hidden\":false}]}}")</script>

<script>window._analyticsConfig = JSON.parse("{\"properties\":{\"subdomain\":\"example\",\"publication_id\":100,\"section_name\":\"Tech\",\"section_slug\":\"tech\",\"section_id\":200,\"section_is_podcast\":false}}")</script>

</body>
</html>
295 changes: 295 additions & 0 deletions packages/migrate/test/substack-e2e.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,295 @@
import assert from 'node:assert/strict';
import {readFile} from 'node:fs/promises';
import {existsSync, mkdirSync, writeFileSync, readFileSync, rmSync} from 'node:fs';
import {join} from 'node:path';
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Search for the pattern: new URL(...).pathname with import.meta.url
rg -nP "new URL\([^)]*import\.meta\.url\)\.pathname" --type=js -C 2

Repository: TryGhost/migrate

Length of output: 8601


🏁 Script executed:

#!/bin/bash
# Also search for variations without escaping to catch the pattern
rg "new URL.*import.meta.url.*pathname" --type=js -C 2

Repository: TryGhost/migrate

Length of output: 8361


🏁 Script executed:

#!/bin/bash
# Check the specific file mentioned in the review
head -60 packages/migrate/test/substack-e2e.test.js | cat -n

Repository: TryGhost/migrate

Length of output: 2981


🌐 Web query:

fileURLToPath Node.js official documentation filesystem path

💡 Result:

fileURLToPath is a Node.js API in the node:url module that converts a file: URL (as a URL object or string) into a fully-resolved, platform-specific absolute filesystem path, correctly decoding percent-encoded characters. [1]

import { fileURLToPath } from 'node:url';

const __filename = fileURLToPath(import.meta.url);

Notes:

  • Signature: url.fileURLToPath(url[, options]) (with options.windows to force Windows vs POSIX parsing). [1]
  • If you’re ultimately calling fs APIs, most fs operations accept paths as a string, Buffer, or a file: URL (URL object). [2]

Sources: Node.js URL docs (url.fileURLToPath) [1], Node.js File system docs (file paths / File URL paths) [2].


Use fileURLToPath() for filesystem-safe file URL conversion across platforms.

The .pathname property of file URLs can produce incorrect paths on Windows (forward slashes, missing drive letter handling) and with percent-encoded characters. Use fileURLToPath() from node:url instead.

🔧 Proposed fix
import {join} from 'node:path';
+import {fileURLToPath} from 'node:url';
@@
-        fixtureDir = join(
-            new URL('.', import.meta.url).pathname,
-            'fixtures',
-            'substack-e2e-tmp'
-        );
+        fixtureDir = fileURLToPath(new URL('./fixtures/substack-e2e-tmp', import.meta.url));

Also applies to: 52-56

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@packages/migrate/test/substack-e2e.test.js` at line 4, Replace any use of a
file URL's .pathname when converting to filesystem paths with node's
fileURLToPath to ensure Windows and percent-encoded characters are handled
correctly: import {fileURLToPath} from 'node:url' alongside the existing join
import, then change occurrences where you currently read fileUrl.pathname (or
similar variables like fileUrl) and pass that into join to instead call
fileURLToPath(fileUrl) and use the result; update all instances in this file
(including the uses around the join calls and the block covering the other
occurrences) so path construction uses fileURLToPath(fileUrl) + join as
appropriate.

import {execSync} from 'node:child_process';
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# First, check if the file exists and its line count
wc -l packages/migrate/test/substack-e2e.test.js

Repository: TryGhost/migrate

Length of output: 106


🏁 Script executed:

# Read the imports at the top of the file, focusing on line 5
head -20 packages/migrate/test/substack-e2e.test.js | cat -n

Repository: TryGhost/migrate

Length of output: 1307


🏁 Script executed:

# Check the specific lines mentioned: 5, 66-67, 144-145
sed -n '1,10p; 60,75p; 140,150p' packages/migrate/test/substack-e2e.test.js | cat -n

Repository: TryGhost/migrate

Length of output: 1973


🏁 Script executed:

# Get lines 60-75 to find the zip command
sed -n '60,75p' packages/migrate/test/substack-e2e.test.js | cat -n

Repository: TryGhost/migrate

Length of output: 837


🏁 Script executed:

# Get lines 135-150 to find the unzip command
sed -n '135,150p' packages/migrate/test/substack-e2e.test.js | cat -n

Repository: TryGhost/migrate

Length of output: 939


Replace shell-interpolated archive commands with argument-based execFileSync to prevent path escaping and injection vulnerabilities.

Lines 5, 66–67, and 144–145 use execSync with shell-interpolated zip/unzip commands, which are vulnerable to path escaping issues and command-string parsing fragility. Use execFileSync with argument arrays instead.

🔧 Proposed fix
-import {execSync} from 'node:child_process';
+import {execFileSync} from 'node:child_process';
@@
-        execSync(`zip -r ${zipPath} posts.csv posts/`, {
+        execFileSync('zip', ['-r', zipPath, 'posts.csv', 'posts/'], {
             cwd: fixtureDir
         });
@@
-        execSync(`unzip -o "${ctx.outputFile.path}" -d "${unzipDir}"`);
+        execFileSync('unzip', ['-o', ctx.outputFile.path, '-d', unzipDir]);
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@packages/migrate/test/substack-e2e.test.js` at line 5, The tests import and
use execSync with shell-interpolated zip/unzip commands (via the execSync
symbol) which is unsafe; replace those usages with execFileSync from
'node:child_process' and call execFileSync with the executable name ('zip' or
'unzip') and an array of arguments (e.g., ['-r', dest, source] or ['-o',
archive, file]) to avoid shell interpolation and path-escaping issues; update
the import to include execFileSync, find all occurrences of execSync that run
zip/unzip in this test file (the zip/unzip command invocations), and convert
them to execFileSync(executable, args, {stdio: 'inherit'}) or appropriate
options while preserving exit behavior and captured output.

import {describe, it, before, after} from 'node:test';
import nock from 'nock';
import substackSource from '../sources/substack.js';

// Smallest valid JPEG: a 1x1 red pixel
const JPEG_BUFFER = Buffer.from(
'/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMCwsKCwsM' +
'DhEQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/2wBDAQMEBAUEBQkFBQkUDQsNFBQUFBQUFBQUFBQU' +
'FBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBT/wAARCAABAAEDASIAAhEBAxEB/8QAFAABAAAAAAAAAAAAAAAAAAAACf' +
'/EABQQAQAAAAAAAAAAAAAAAAAAAAD/xAAUAQEAAAAAAAAAAAAAAAAAAAAA/8QAFBEBAAAAAAAAAAAAAAAAAAAAAP/aAAwDAQACEQMRAD8AKwA//9k=',
'base64'
);

const CSV_CONTENT = `post_id,post_date,is_published,email_sent_at,type,audience,title,subtitle,podcast_url
123401.plain-text,2024-01-15T10:00:00.000Z,TRUE,2024-01-15T10:00:00.000Z,newsletter,everyone,Plain Text,A test subtitle,
123404.draft-text,2024-01-16T10:00:00.000Z,FALSE,,newsletter,everyone,Draft Post,A draft subtitle,
`;

const PUBLISHED_POST_HTML = `<h2>Hello World</h2>
<p>This is a test post with an image.</p>
<img src="https://substack-post-media.s3.amazonaws.com/public/images/test-image.jpg" alt="Test image">
<p class="button-wrapper" data-attrs='{"url":"https://example.substack.com/subscribe?","text":"Subscribe","class":null}'>
<a class="button primary" href="https://example.substack.com/subscribe?"><span>Subscribe</span></a>
</p>
<p>End of post.</p>
`;

const DRAFT_POST_HTML = `<h2>Draft Post</h2>
<p>This is a draft post.</p>
`;

describe('Substack E2E Migration', function () {
let ghostImport;
let outputDir;
let fixtureDir;
let zipPath;
let nockAssetImage;
let nockAssetOgImage;

before(async function () {
// Load the scrape fixture HTML
const scrapeFixtureHTML = await readFile(
new URL('./fixtures/substack-e2e-scrape.html', import.meta.url)
);

// Create temp directory with Substack export structure
fixtureDir = join(
new URL('.', import.meta.url).pathname,
'fixtures',
'substack-e2e-tmp'
);
const postsDir = join(fixtureDir, 'posts');
mkdirSync(postsDir, {recursive: true});

writeFileSync(join(fixtureDir, 'posts.csv'), CSV_CONTENT);
writeFileSync(join(postsDir, '123401.plain-text.html'), PUBLISHED_POST_HTML);
writeFileSync(join(postsDir, '123404.draft-text.html'), DRAFT_POST_HTML);

// Create the input ZIP
zipPath = join(fixtureDir, '..', 'substack-e2e.zip');
execSync(`zip -r ${zipPath} posts.csv posts/`, {
cwd: fixtureDir
});
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unquoted shell path in execSync zip command

Low Severity

The zipPath variable is interpolated into the execSync shell command without quotes, while the unzip command on line 144 properly quotes its path arguments. If the resolved test path contains spaces, the zip command will break because the shell will split it into multiple arguments.

Additional Locations (1)

Fix in Cursor Fix in Web


// Create a temp directory for the output ZIP
outputDir = join(fixtureDir, '..', 'substack-e2e-output');
mkdirSync(outputDir, {recursive: true});

// Disable all real HTTP
nock.disableNetConnect();

// Mock web scraper request for published post
nock('https://example.substack.com')
.get('/p/plain-text')
.reply(200, scrapeFixtureHTML, {
'Content-Type': 'text/html'
});

// Mock asset scraper requests for images
nockAssetImage = nock('https://substack-post-media.s3.amazonaws.com')
.get('/public/images/test-image.jpg')
.reply(200, JPEG_BUFFER, {
'Content-Type': 'image/jpeg'
});

nockAssetOgImage = nock('https://substack-post-media.s3.amazonaws.com')
.get('/public/images/og-image.jpg')
.reply(200, JPEG_BUFFER, {
'Content-Type': 'image/jpeg'
});

// Run the full migration pipeline with zip enabled
const options = {
pathToZip: zipPath,
url: 'https://example.substack.com',
scrape: ['all'],
wait_after_scrape: 0,
zip: true,
cache: false,
verbose: false,
renderer: 'silent',
posts: true,
drafts: true,
pages: true,
podcasts: true,
threads: false,
useMetaImage: true,
useFirstImage: true,
useMetaAuthor: true,
addPlatformTag: true,
addTypeTag: true,
addAccessTag: true,
addTag: null,
subscribeLink: '#/portal/signup',
noSubscribeButtons: false,
comments: true,
commentLink: '#ghost-comments-root',
fallBackHTMLCard: true,
postsBefore: null,
postsAfter: null,
email: null,
tmpPath: null,
outputPath: outputDir
};

const ctx = {
errors: []
};

const migrate = substackSource.getTaskRunner(options);
await migrate.run(ctx);

// Unzip the output
assert.ok(ctx.outputFile, 'should have produced an output file');
assert.ok(existsSync(ctx.outputFile.path), `output zip should exist at ${ctx.outputFile.path}`);

const unzipDir = join(outputDir, 'unzipped');
mkdirSync(unzipDir, {recursive: true});
execSync(`unzip -o "${ctx.outputFile.path}" -d "${unzipDir}"`);

// Read the Ghost import JSON from the unzipped output
const jsonPath = join(unzipDir, 'ghost-import.json');
assert.ok(existsSync(jsonPath), `ghost-import.json should exist at ${jsonPath}`);
ghostImport = JSON.parse(readFileSync(jsonPath, 'utf8'));
});

after(function () {
nock.cleanAll();
nock.enableNetConnect();

// Clean up all temp files
rmSync(fixtureDir, {recursive: true, force: true});
rmSync(zipPath, {force: true});
rmSync(outputDir, {recursive: true, force: true});
Comment on lines +152 to +159
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Guard cleanup paths so teardown does not mask setup failures.

If setup exits early, rmSync can receive undefined paths and throw, hiding the original failure. Add existence checks before cleanup calls.

🔧 Proposed fix
     after(function () {
         nock.cleanAll();
         nock.enableNetConnect();

         // Clean up all temp files
-        rmSync(fixtureDir, {recursive: true, force: true});
-        rmSync(zipPath, {force: true});
-        rmSync(outputDir, {recursive: true, force: true});
+        if (fixtureDir) {
+            rmSync(fixtureDir, {recursive: true, force: true});
+        }
+        if (zipPath) {
+            rmSync(zipPath, {force: true});
+        }
+        if (outputDir) {
+            rmSync(outputDir, {recursive: true, force: true});
+        }
     });
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
after(function () {
nock.cleanAll();
nock.enableNetConnect();
// Clean up all temp files
rmSync(fixtureDir, {recursive: true, force: true});
rmSync(zipPath, {force: true});
rmSync(outputDir, {recursive: true, force: true});
after(function () {
nock.cleanAll();
nock.enableNetConnect();
// Clean up all temp files
if (fixtureDir) {
rmSync(fixtureDir, {recursive: true, force: true});
}
if (zipPath) {
rmSync(zipPath, {force: true});
}
if (outputDir) {
rmSync(outputDir, {recursive: true, force: true});
}
});
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@packages/migrate/test/substack-e2e.test.js` around lines 152 - 159, The
teardown in the after hook calls rmSync on fixtureDir, zipPath, and outputDir
without guards, which can throw if setup failed and those variables are
undefined; update the after() cleanup to check each variable exists (and
optionally fs.existsSync(path) is true) before calling rmSync for fixtureDir,
zipPath, and outputDir (or wrap each rmSync in a safe check/try-catch) so
teardown doesn't mask setup failures.

});

it('produces valid Ghost JSON with expected post count', function () {
assert.ok(ghostImport);
assert.ok(ghostImport.data);
assert.ok(ghostImport.data.posts);
assert.equal(ghostImport.data.posts.length, 2);
});

it('has correct post metadata', function () {
const posts = ghostImport.data.posts;

const published = posts.find(p => p.slug === 'plain-text');
assert.ok(published, 'published post should exist');
assert.equal(published.title, 'Plain Text');
assert.equal(published.slug, 'plain-text');
assert.equal(published.status, 'published');
assert.equal(published.visibility, 'public');

const draft = posts.find(p => p.slug === 'draft-text');
assert.ok(draft, 'draft post should exist');
assert.equal(draft.status, 'draft');
});

it('includes web-scraped metadata on the published post', function () {
const published = ghostImport.data.posts.find(p => p.slug === 'plain-text');
const postMeta = ghostImport.data.posts_meta.find(m => m.post_id === published.id);

assert.ok(postMeta, 'published post should have posts_meta entry');
assert.equal(postMeta.meta_title, 'Plain Text - Test Author');
assert.equal(postMeta.meta_description, 'A test meta description');
assert.equal(postMeta.og_image, '__GHOST_URL__/content/images/substack-post-media-s3-amazonaws-com/public/images/og-image.jpg');
assert.equal(postMeta.og_title, 'Plain Text OG Title');
assert.equal(postMeta.og_description, 'OG description for the post');
assert.equal(postMeta.twitter_image, '__GHOST_URL__/content/images/substack-post-media-s3-amazonaws-com/public/images/og-image.jpg');
assert.equal(postMeta.twitter_title, 'Plain Text Twitter Title');
assert.equal(postMeta.twitter_description, 'Twitter description for the post');

// Draft should not have scraped metadata
const draft = ghostImport.data.posts.find(p => p.slug === 'draft-text');
const draftMeta = ghostImport.data.posts_meta.find(m => m.post_id === draft.id);
assert.ok(!draftMeta || !draftMeta.meta_title);
});

it('extracts authors from ld+json', function () {
const published = ghostImport.data.posts.find(p => p.slug === 'plain-text');
const postAuthors = ghostImport.data.posts_authors.filter(pa => pa.post_id === published.id);
const users = ghostImport.data.users;

assert.equal(postAuthors.length, 1);

const authorIds = postAuthors.map(pa => pa.author_id);
const postUsers = users.filter(u => authorIds.includes(u.id));

assert.equal(postUsers.length, 1);
assert.equal(postUsers[0].slug, 'test-author');
assert.equal(postUsers[0].name, 'Test Author');
assert.equal(postUsers[0].email, 'test-author@example.com');
});

it('extracts tags from scripts and adds platform/type/access tags', function () {
const published = ghostImport.data.posts.find(p => p.slug === 'plain-text');
const postTags = ghostImport.data.posts_tags.filter(pt => pt.post_id === published.id);
const allTags = ghostImport.data.tags;

const tagIds = postTags.map(pt => pt.tag_id);
const tagNames = allTags.filter(t => tagIds.includes(t.id)).map(t => t.name);

assert.deepEqual(tagNames, [
'Tech', // section tag from _analyticsConfig
'Testing', // post tag from _preloads
'Newsletter', // type tag from CSV
'#substack', // platform tag
'#substack-type-newsletter', // type tag
'#substack-access-everyone' // access tag
]);
});

it('converts HTML to valid Lexical JSON', function () {
const published = ghostImport.data.posts.find(p => p.slug === 'plain-text');
const lexical = JSON.parse(published.lexical);

assert.deepEqual(lexical, {
root: {
children: [
{
children: [{detail: 0, format: 0, mode: 'normal', style: '', text: 'Hello World', type: 'extended-text', version: 1}],
direction: null, format: '', indent: 0, type: 'extended-heading', version: 1, tag: 'h2'
},
{
children: [{detail: 0, format: 0, mode: 'normal', style: '', text: 'This is a test post with an image.', type: 'extended-text', version: 1}],
direction: null, format: '', indent: 0, type: 'paragraph', version: 1
},
{
type: 'image', version: 1,
src: '__GHOST_URL__/content/images/substack-post-media-s3-amazonaws-com/public/images/test-image.jpg',
width: null, height: null, title: '', alt: 'Test image', caption: '', cardWidth: 'regular', href: ''
},
{
type: 'button', version: 1,
buttonText: 'Subscribe', alignment: 'center', buttonUrl: '#/portal/signup'
},
{
children: [{detail: 0, format: 0, mode: 'normal', style: '', text: 'End of post.', type: 'extended-text', version: 1}],
direction: null, format: '', indent: 0, type: 'paragraph', version: 1
}
],
direction: null, format: '', indent: 0, type: 'root', version: 1
}
});
});

it('downloads assets and includes them in the output zip', function () {
// Verify nock mocks were called
assert.ok(nockAssetImage.isDone(), 'test-image.jpg should have been fetched');
assert.ok(nockAssetOgImage.isDone(), 'og-image.jpg should have been fetched');

// Check that image files exist in the unzipped output
const unzipDir = join(outputDir, 'unzipped');
const imagePath = join(unzipDir, 'content', 'images', 'substack-post-media-s3-amazonaws-com', 'public', 'images', 'test-image.jpg');
const ogImagePath = join(unzipDir, 'content', 'images', 'substack-post-media-s3-amazonaws-com', 'public', 'images', 'og-image.jpg');

assert.ok(existsSync(imagePath), `test-image.jpg should exist in unzipped output at ${imagePath}`);
assert.ok(existsSync(ogImagePath), `og-image.jpg should exist in unzipped output at ${ogImagePath}`);
});

it('processes subscribe buttons', function () {
const published = ghostImport.data.posts.find(p => p.slug === 'plain-text');

// The original subscribe button URL should have been replaced
assert.ok(
!published.lexical.includes('https://example.substack.com/subscribe?'),
'subscribe button URL should be rewritten'
);
});
});