Files
SanatiLeads/lib/scraper.ts
2026-02-02 15:58:45 +00:00

138 lines
4.3 KiB
TypeScript

import puppeteer from 'puppeteer'
import type { ScrapedContent } from './types'
/**
 * Error raised by the scraping pipeline. Carries a machine-readable
 * code (e.g. 'INVALID_URL', 'DNS_ERROR', 'TIMEOUT', 'UNKNOWN') so
 * callers can branch on the failure kind without parsing the message.
 */
export class ScrapingError extends Error {
  public code: string

  constructor(message: string, code: string) {
    super(message)
    this.code = code
    this.name = 'ScrapingError'
  }
}
/**
 * Launches a headless browser, loads `url`, and extracts structured
 * content: title, meta description, headings, paragraphs, list items,
 * internal-link labels, and a capped slice of the raw body text.
 *
 * @param url - Website address; a missing `http(s)://` scheme is added.
 * @returns Deduplicated, size-capped scraped content for the page.
 * @throws ScrapingError with code 'INVALID_URL', 'DNS_ERROR', 'TIMEOUT',
 *         or 'UNKNOWN' depending on the failure mode.
 */
export async function scrapeWebsite(url: string): Promise<ScrapedContent> {
  // Add a scheme only when one is genuinely absent. The previous
  // `startsWith('http')` check wrongly treated hosts such as
  // "httpdocs.example.com" as already carrying a scheme, which then
  // failed `new URL()` and surfaced as INVALID_URL.
  const validatedUrl = /^https?:\/\//i.test(url) ? url : `https://${url}`

  try {
    new URL(validatedUrl)
  } catch {
    throw new ScrapingError('Invalid URL format. Please enter a valid website URL.', 'INVALID_URL')
  }

  let browser
  try {
    browser = await puppeteer.launch({
      // NOTE(review): 'new' was the transitional headless flag; newer
      // puppeteer versions accept plain `true` — confirm against the
      // pinned puppeteer version before changing.
      headless: 'new',
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    })
    const page = await browser.newPage()
    await page.setViewport({ width: 1920, height: 1080 })
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
    await page.goto(validatedUrl, {
      waitUntil: 'networkidle2',
      timeout: 30000
    })

    // This closure is serialized and executed inside the browser page,
    // so it must remain self-contained (no references to outer scope).
    const extractedContent = await page.evaluate(() => {
      const title = document.title || document.querySelector('h1')?.textContent || ''
      const metaDesc = document.querySelector('meta[name="description"]')?.getAttribute('content') ||
        document.querySelector('meta[property="og:description"]')?.getAttribute('content') || ''

      const headings: string[] = []
      document.querySelectorAll('h1, h2, h3').forEach(el => {
        const text = el.textContent?.trim()
        if (text && text.length > 5 && text.length < 200) headings.push(text)
      })

      const paragraphs: string[] = []
      document.querySelectorAll('p').forEach(el => {
        const text = el.textContent?.trim()
        // The '{' filter screens out inline CSS/JS mistakenly wrapped in <p>.
        if (text && text.length > 30 && text.length < 500 && !text.includes('{')) {
          paragraphs.push(text)
        }
      })

      const featureList: string[] = []
      document.querySelectorAll('ul li, ol li').forEach(el => {
        const text = el.textContent?.trim()
        if (text && text.length > 10 && text.length < 200) featureList.push(text)
      })

      // Only relative links — i.e. the site's own navigation labels.
      const links: string[] = []
      document.querySelectorAll('a[href^="/"], a[href^="./"]').forEach(el => {
        const text = el.textContent?.trim()
        if (text && text.length > 3 && text.length < 50) links.push(text)
      })

      // Get all visible text for raw analysis
      const bodyText = document.body.innerText || ''
      return {
        title,
        metaDescription: metaDesc,
        headings,
        paragraphs,
        featureList,
        links,
        rawText: bodyText
      }
    })

    // Deduplicate and cap every list so downstream prompts stay bounded.
    return {
      url: validatedUrl,
      title: extractedContent.title,
      metaDescription: extractedContent.metaDescription,
      headings: [...new Set(extractedContent.headings)].slice(0, 20),
      paragraphs: [...new Set(extractedContent.paragraphs)].slice(0, 30),
      featureList: [...new Set(extractedContent.featureList)].slice(0, 20),
      links: [...new Set(extractedContent.links)].slice(0, 15),
      rawText: extractedContent.rawText.slice(0, 10000) // Limit raw text
    }
  } catch (error: unknown) {
    console.error('Scraping error:', error)
    // Narrow before touching .message — `any` here defeated strict mode.
    const message = error instanceof Error ? error.message : String(error)
    if (message.includes('ERR_NAME_NOT_RESOLVED') || message.includes('net::ERR')) {
      throw new ScrapingError(
        `Could not reach ${validatedUrl}. Please check the URL or try entering your product description manually.`,
        'DNS_ERROR'
      )
    }
    if (message.includes('timeout')) {
      throw new ScrapingError(
        'The website took too long to respond. Please try again or enter your product description manually.',
        'TIMEOUT'
      )
    }
    throw new ScrapingError(
      'Failed to scrape the website. Please try again or enter your product description manually.',
      'UNKNOWN'
    )
  } finally {
    // Swallow close() failures so they can never mask the real error.
    if (browser) await browser.close().catch(() => {})
  }
}
/**
 * Builds a ScrapedContent record from manually-entered product details —
 * the fallback path when scrapeWebsite fails — mirroring the shape the
 * scraper produces so downstream consumers need no special-casing.
 *
 * @param productName - Product/company name; used as the title.
 * @param description - Free-text product description.
 * @param features - Newline-separated feature list.
 * @returns Synthetic scraped content with url 'manual-input'.
 */
export async function analyzeFromText(
  productName: string,
  description: string,
  features: string
): Promise<ScrapedContent> {
  // Trim each line so CRLF input ('\r' residue) and whitespace-padded
  // entries don't leak into the feature list; drop blank lines. The old
  // `.filter(f => f.trim())` kept the untrimmed originals.
  const featureList = features
    .split('\n')
    .map(f => f.trim())
    .filter(f => f.length > 0)

  return {
    url: 'manual-input',
    title: productName,
    metaDescription: description,
    headings: [productName, 'Features', 'Benefits'],
    paragraphs: [description, features],
    featureList,
    links: [],
    rawText: `${productName}\n\n${description}\n\n${features}`
  }
}