mirror of
https://github.com/oven-sh/bun
synced 2026-02-02 15:08:46 +00:00
Add a couple HTMLRewriter guides
This commit is contained in:
68
docs/guides/html-rewriter/extract-links.md
Normal file
68
docs/guides/html-rewriter/extract-links.md
Normal file
@@ -0,0 +1,68 @@
|
||||
---
|
||||
name: Extract links from a webpage using HTMLRewriter
|
||||
---
|
||||
|
||||
## Extract links from a webpage
|
||||
|
||||
Bun's [HTMLRewriter](https://bun.sh/docs/api/html-rewriter) API can be used to efficiently extract links from HTML content. You register handlers for CSS selectors with `.on()`, and each handler is called for the elements, text, and attributes its selector matches. The example below shows how to extract every link from a webpage. `.transform()` accepts a `Response`, `Blob`, or `string`.
|
||||
|
||||
```ts
|
||||
/**
 * Fetches `url` and logs every distinct `href` value found on its `<a>` elements.
 *
 * Values are collected exactly as they appear in the markup, so relative
 * links such as `/docs` are not resolved.
 */
async function extractLinks(url: string) {
  const response = await fetch(url);
  const hrefs = new Set<string>();

  const rewriter = new HTMLRewriter().on("a[href]", {
    element(anchor) {
      const target = anchor.getAttribute("href");
      // Nothing to record for a missing or empty href
      if (!target) return;
      hrefs.add(target);
    },
  });

  // Consume the transformed body so the whole response gets processed
  await rewriter.transform(response).blob();
  console.log(Array.from(hrefs)); // ["https://bun.sh", "/docs", ...]
}

// Extract all links from the Bun website
await extractLinks("https://bun.sh");
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Convert relative URLs to absolute
|
||||
|
||||
When scraping websites, you often want to convert relative URLs (like `/docs`) to absolute URLs. Here's how to handle URL resolution:
|
||||
|
||||
```ts
|
||||
/**
 * Fetches `url` and returns the distinct links found on its `<a href>` elements
 * as absolute URLs.
 *
 * Relative links are resolved against the URL the response was actually served
 * from, so they stay correct when the request was redirected.
 *
 * @param url - Address of the page to scan.
 * @returns The unique links in the order they were first seen. An `href` that
 * cannot be parsed as a URL is returned unchanged.
 */
async function extractLinksFromURL(url: string): Promise<string[]> {
  const response = await fetch(url);
  // `response.url` is the final URL after any redirects; fall back to the
  // requested URL if it is empty.
  const baseURL = response.url || url;
  const links = new Set<string>();

  const rewriter = new HTMLRewriter().on("a[href]", {
    element(el) {
      const href = el.getAttribute("href");
      if (!href) return;

      // Convert relative URLs to absolute
      try {
        links.add(new URL(href, baseURL).href);
      } catch {
        // Best effort: keep the raw value when it is not a parseable URL
        links.add(href);
      }
    },
  });

  // Wait for the response to be processed
  await rewriter.transform(response).blob();
  return [...links];
}

const websiteLinks = await extractLinksFromURL("https://example.com");
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
See [Docs > API > HTMLRewriter](https://bun.sh/docs/api/html-rewriter) for complete documentation on HTML transformation with Bun.
|
||||
93
docs/guides/html-rewriter/extract-social-meta.md
Normal file
93
docs/guides/html-rewriter/extract-social-meta.md
Normal file
@@ -0,0 +1,93 @@
|
||||
---
|
||||
name: Extract social share images and Open Graph tags
|
||||
---
|
||||
|
||||
## Extract social share images and Open Graph tags
|
||||
|
||||
Bun's [HTMLRewriter](https://bun.sh/docs/api/html-rewriter) API can be used to efficiently extract social share images and Open Graph metadata from HTML content. This is particularly useful for building link preview features, social media cards, or web scrapers. HTMLRewriter lets us register handlers for CSS selectors that match the HTML elements, text, and attributes we want to process.
|
||||
|
||||
```ts
|
||||
/**
 * Social sharing metadata collected from a page.
 * Every field is optional because a page may omit any of these tags.
 */
interface SocialMetadata {
  /** Title to show for the page. */
  title?: string;
  /** Short summary of the page. */
  description?: string;
  /** URL of the share image. */
  image?: string;
  /** URL the page declares for itself. */
  url?: string;
  /** Name of the site the page belongs to. */
  siteName?: string;
  /** Kind of content, e.g. "website". */
  type?: string;
}
|
||||
|
||||
/**
 * Maps the part of a tag name after its `og:` / `twitter:` prefix to the
 * `SocialMetadata` field it fills. Suffixes that are not listed here
 * (for example `image:width` or `locale`) are ignored.
 */
const SOCIAL_META_FIELDS = new Map<string, keyof SocialMetadata>([
  ["title", "title"],
  ["description", "description"],
  ["image", "image"],
  ["url", "url"],
  ["site_name", "siteName"],
  ["type", "type"],
]);

/**
 * Stores `content` on `target` under the field that `tagName` maps to.
 * The first value seen for a field is kept; later duplicates are ignored.
 */
function recordSocialTag(
  target: SocialMetadata,
  tagName: string | null,
  prefix: string,
  content: string | null,
): void {
  if (!tagName || !content) return;
  // The selector already guarantees that `tagName` starts with `prefix`
  const field = SOCIAL_META_FIELDS.get(tagName.slice(prefix.length));
  if (field !== undefined && !target[field]) {
    target[field] = content;
  }
}

/**
 * Fetches `url` and extracts its social sharing metadata.
 *
 * Each field is filled from the first source that provides it, in this order:
 * Open Graph tags, Twitter Card tags, then the regular `<meta name="description">`
 * and `<title>` tags. The precedence does not depend on the order in which the
 * tags appear in the document.
 *
 * @param url - Address of the page to inspect.
 * @returns The metadata that was found; fields with no source are left unset.
 * A relative `image` is resolved against the URL the response was served from.
 */
async function extractSocialMetadata(url: string): Promise<SocialMetadata> {
  const response = await fetch(url);

  // Each source is collected separately and merged once the whole document
  // has been processed.
  const openGraph: SocialMetadata = {};
  const twitterCard: SocialMetadata = {};
  let metaDescription = "";
  let titleText = "";
  let titleComplete = false;

  const rewriter = new HTMLRewriter()
    // Extract Open Graph meta tags
    .on('meta[property^="og:"]', {
      element(el) {
        // Convert "og:image" to "image" etc.
        recordSocialTag(openGraph, el.getAttribute("property"), "og:", el.getAttribute("content"));
      },
    })
    // Extract Twitter Card meta tags as fallback
    .on('meta[name^="twitter:"]', {
      element(el) {
        recordSocialTag(twitterCard, el.getAttribute("name"), "twitter:", el.getAttribute("content"));
      },
    })
    // Fallback to regular meta tags
    .on('meta[name="description"]', {
      element(el) {
        const content = el.getAttribute("content");
        if (content && !metaDescription) {
          metaDescription = content;
        }
      },
    })
    // Fallback to title tag
    .on("title", {
      text(chunk) {
        // Text can be delivered in several chunks, so join them, and stop
        // after the first complete text node.
        if (titleComplete) return;
        titleText += chunk.text;
        if (chunk.lastInTextNode) {
          titleComplete = true;
        }
      },
    });

  // Process the response
  await rewriter.transform(response).blob();

  // Open Graph values override Twitter Card values
  const metadata: SocialMetadata = { ...twitterCard, ...openGraph };

  const pageTitle = titleText.trim();
  if (!metadata.title && pageTitle) {
    metadata.title = pageTitle;
  }
  if (!metadata.description && metaDescription) {
    metadata.description = metaDescription;
  }

  // Convert relative image URLs to absolute
  if (metadata.image && !/^https?:\/\//i.test(metadata.image)) {
    try {
      // `response.url` is the final URL after any redirects
      metadata.image = new URL(metadata.image, response.url || url).href;
    } catch {
      // Keep the original URL if parsing fails
    }
  }

  return metadata;
}

// Example usage
const metadata = await extractSocialMetadata("https://bun.sh");
console.log(metadata);
// {
//   title: "Bun — A fast all-in-one JavaScript runtime",
//   description: "Bundle, transpile, install and run JavaScript & TypeScript projects — all in Bun. Bun is a fast all-in-one JavaScript runtime & toolkit designed for speed, complete with a bundler, test runner, and Node.js-compatible package manager.",
//   image: "https://bun.sh/share.jpg",
//   type: "website",
//   ...
// }
|
||||
```
|
||||
4
docs/guides/html-rewriter/index.json
Normal file
4
docs/guides/html-rewriter/index.json
Normal file
@@ -0,0 +1,4 @@
|
||||
{
|
||||
"name": "HTMLRewriter",
|
||||
"description": "A collection of guides for using the HTMLRewriter streaming HTML parser with Bun"
|
||||
}
|
||||
Reference in New Issue
Block a user