138 lines
4.3 KiB
TypeScript
138 lines
4.3 KiB
TypeScript
import puppeteer from 'puppeteer'
|
|
import type { ScrapedContent } from './types'
|
|
|
|
/**
 * Error thrown when a website cannot be scraped.
 *
 * Alongside the human-readable `message`, every instance carries a short
 * machine-readable `code` so callers can branch on the failure category
 * without parsing the message text.
 */
export class ScrapingError extends Error {
  /** Machine-readable failure category supplied by the thrower. */
  public code: string

  /**
   * @param message - User-facing description of what went wrong.
   * @param code - Short identifier for the failure category.
   */
  constructor(message: string, code: string) {
    super(message)
    this.code = code
    // Override the default "Error" name so logs and stack traces identify this class.
    this.name = 'ScrapingError'
  }
}
|
|
|
|
/**
 * Trims the user-supplied URL, prepends `https://` when no `http://` or
 * `https://` scheme is present, and checks that the result parses as a URL.
 *
 * The scheme test is anchored on `://` and case-insensitive, so bare hosts that
 * merely begin with the letters "http" (e.g. `httpbin.org`) still get a scheme,
 * and `HTTP://example.com` is not double-prefixed.
 *
 * @throws ScrapingError with code `INVALID_URL` when the result cannot be parsed.
 */
function normalizeTargetUrl(url: string): string {
  const trimmed = url.trim()
  const candidate = /^https?:\/\//i.test(trimmed) ? trimmed : `https://${trimmed}`

  try {
    // Constructed purely for validation; the URL constructor throws on malformed input.
    new URL(candidate)
  } catch {
    throw new ScrapingError('Invalid URL format. Please enter a valid website URL.', 'INVALID_URL')
  }

  return candidate
}

/**
 * Maps an arbitrary thrown value from the scraping pipeline onto a
 * user-facing ScrapingError (`DNS_ERROR`, `TIMEOUT` or `UNKNOWN`).
 */
function toScrapingError(error: unknown, targetUrl: string): ScrapingError {
  let message = ''
  let name = ''
  if (typeof error === 'object' && error !== null) {
    const candidate = error as { message?: unknown; name?: unknown }
    if (typeof candidate.message === 'string') message = candidate.message
    if (typeof candidate.name === 'string') name = candidate.name
  }

  // Any Chromium network-layer failure (`net::ERR_*`) is reported as "could not reach".
  if (message.includes('ERR_NAME_NOT_RESOLVED') || message.includes('net::ERR')) {
    return new ScrapingError(
      `Could not reach ${targetUrl}. Please check the URL or try entering your product description manually.`,
      'DNS_ERROR'
    )
  }

  if (name === 'TimeoutError' || message.includes('timeout')) {
    return new ScrapingError(
      'The website took too long to respond. Please try again or enter your product description manually.',
      'TIMEOUT'
    )
  }

  return new ScrapingError(
    'Failed to scrape the website. Please try again or enter your product description manually.',
    'UNKNOWN'
  )
}

/**
 * Loads a web page in headless Chromium and extracts its textual content.
 *
 * The page title, meta description, headings (h1-h3), paragraphs, list items,
 * same-site link labels and the visible body text are collected. Lists are
 * de-duplicated and capped (20 headings, 30 paragraphs, 20 list items,
 * 15 links) and the raw text is limited to 10,000 characters.
 *
 * NOTE(review): `url` is navigated to as given, with no host filtering in this
 * function. If it comes from untrusted users, confirm that requests to
 * internal addresses are blocked elsewhere.
 *
 * @param url - Website address; `https://` is assumed when no http(s) scheme is given.
 * @returns The extracted content together with the normalized URL.
 * @throws ScrapingError with code `INVALID_URL`, `DNS_ERROR`, `TIMEOUT` or `UNKNOWN`.
 */
export async function scrapeWebsite(url: string): Promise<ScrapedContent> {
  const validatedUrl = normalizeTargetUrl(url)

  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined
  try {
    browser = await puppeteer.launch({
      headless: 'new',
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    })

    const page = await browser.newPage()
    await page.setViewport({ width: 1920, height: 1080 })
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')

    await page.goto(validatedUrl, {
      waitUntil: 'networkidle2',
      timeout: 30000
    })

    // This callback is serialized and executed inside the page, so it must stay
    // self-contained: it cannot reference anything from the surrounding module.
    const extractedContent = await page.evaluate(() => {
      const title = document.title || document.querySelector('h1')?.textContent || ''

      const metaDesc =
        document.querySelector('meta[name="description"]')?.getAttribute('content') ||
        document.querySelector('meta[property="og:description"]')?.getAttribute('content') ||
        ''

      const headings: string[] = []
      document.querySelectorAll('h1, h2, h3').forEach(el => {
        const text = el.textContent?.trim()
        if (text && text.length > 5 && text.length < 200) headings.push(text)
      })

      const paragraphs: string[] = []
      document.querySelectorAll('p').forEach(el => {
        const text = el.textContent?.trim()
        // Text containing "{" is skipped; presumably to drop inlined CSS/JSON/template residue.
        if (text && text.length > 30 && text.length < 500 && !text.includes('{')) {
          paragraphs.push(text)
        }
      })

      const featureList: string[] = []
      document.querySelectorAll('ul li, ol li').forEach(el => {
        const text = el.textContent?.trim()
        if (text && text.length > 10 && text.length < 200) featureList.push(text)
      })

      // Only relative links ("/..." or "./...") are considered.
      const links: string[] = []
      document.querySelectorAll('a[href^="/"], a[href^="./"]').forEach(el => {
        const text = el.textContent?.trim()
        if (text && text.length > 3 && text.length < 50) links.push(text)
      })

      // Visible text for raw analysis; guard against documents without a <body>.
      const bodyText = document.body?.innerText ?? ''

      return {
        title,
        metaDescription: metaDesc,
        headings,
        paragraphs,
        featureList,
        links,
        rawText: bodyText
      }
    })

    return {
      url: validatedUrl,
      title: extractedContent.title,
      metaDescription: extractedContent.metaDescription,
      headings: [...new Set(extractedContent.headings)].slice(0, 20),
      paragraphs: [...new Set(extractedContent.paragraphs)].slice(0, 30),
      featureList: [...new Set(extractedContent.featureList)].slice(0, 20),
      links: [...new Set(extractedContent.links)].slice(0, 15),
      rawText: extractedContent.rawText.slice(0, 10000) // Limit raw text
    }
  } catch (error: unknown) {
    console.error('Scraping error:', error)
    throw toScrapingError(error, validatedUrl)
  } finally {
    if (browser) {
      try {
        await browser.close()
      } catch (closeError: unknown) {
        // A failed shutdown must not replace the scrape result or the error already in flight.
        console.error('Failed to close browser:', closeError)
      }
    }
  }
}
|
|
|
|
/**
 * Builds a ScrapedContent record from manually entered product details, for
 * use when no website is scraped.
 *
 * @param productName - Product name; used as the title and first heading.
 * @param description - Free-form product description; used as the meta description.
 * @param features - Newline-separated feature list (LF or CRLF line endings).
 * @returns Content with `url` set to the literal `'manual-input'`, no links, and
 *   `featureList` holding one trimmed entry per non-blank line of `features`.
 */
export async function analyzeFromText(
  productName: string,
  description: string,
  features: string
): Promise<ScrapedContent> {
  // Trim every line before dropping blanks so CRLF input does not leave a
  // trailing "\r" (or stray indentation) on each feature.
  const featureList = features
    .split(/\r?\n/)
    .map(line => line.trim())
    .filter(line => line.length > 0)

  return {
    url: 'manual-input',
    title: productName,
    metaDescription: description,
    headings: [productName, 'Features', 'Benefits'],
    paragraphs: [description, features],
    featureList,
    links: [],
    rawText: `${productName}\n\n${description}\n\n${features}`
  }
}
|