initialised repo
This commit is contained in:
137
lib/scraper.ts
Normal file
137
lib/scraper.ts
Normal file
@@ -0,0 +1,137 @@
|
||||
import puppeteer from 'puppeteer'
|
||||
import type { ScrapedContent } from './types'
|
||||
|
||||
/**
 * Error raised by the scraper when a page cannot be fetched or parsed.
 *
 * `code` is a short machine-readable tag supplied by the thrower; within this
 * module the values used are 'INVALID_URL', 'DNS_ERROR', 'TIMEOUT' and 'UNKNOWN'.
 * `message` is written to be shown to the end user.
 */
export class ScrapingError extends Error {
  constructor(message: string, public code: string) {
    super(message)
    // When compiled to ES5, `super(message)` returns a plain Error and drops
    // this subclass's prototype, which breaks `instanceof ScrapingError`.
    // Restoring the prototype explicitly is a no-op on ES2015+ targets.
    Object.setPrototypeOf(this, new.target.prototype)
    this.name = 'ScrapingError'
  }
}
|
||||
|
||||
/**
 * Loads a web page in a headless browser via Puppeteer and extracts its
 * textual content: title, meta description, headings, paragraphs, list items,
 * internal link labels and the visible body text.
 *
 * @param url - Address of the page. If it does not start with `http://` or
 *   `https://` (case-insensitive), `https://` is prepended.
 * @returns The extracted content. Lists are de-duplicated and capped
 *   (20 headings, 30 paragraphs, 20 list items, 15 links); `rawText` is
 *   truncated to 10,000 characters.
 * @throws {ScrapingError} with code `INVALID_URL` when the address cannot be
 *   parsed, `DNS_ERROR` for `net::ERR*` navigation failures, `TIMEOUT` when
 *   navigation times out, and `UNKNOWN` for anything else.
 */
export async function scrapeWebsite(url: string): Promise<ScrapedContent> {
  // Test for a real http(s) scheme rather than a bare "http" prefix, so that
  // hosts such as "http.cat" get a scheme and "HTTP://..." is left alone.
  const trimmedUrl = url.trim()
  const validatedUrl = /^https?:\/\//i.test(trimmedUrl) ? trimmedUrl : `https://${trimmedUrl}`

  try {
    // Constructed only for validation; the URL constructor throws on malformed input.
    new URL(validatedUrl)
  } catch {
    throw new ScrapingError('Invalid URL format. Please enter a valid website URL.', 'INVALID_URL')
  }

  // Declared outside the try so the finally block can close it on every path.
  let browser
  try {
    browser = await puppeteer.launch({
      headless: 'new',
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    })

    const page = await browser.newPage()
    await page.setViewport({ width: 1920, height: 1080 })
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')

    await page.goto(validatedUrl, {
      waitUntil: 'networkidle2',
      timeout: 30000
    })

    // This callback runs inside the page context, so it must be self-contained
    // and may only return serialisable data.
    const extractedContent = await page.evaluate(() => {
      const title = document.title || document.querySelector('h1')?.textContent || ''

      const metaDesc =
        document.querySelector('meta[name="description"]')?.getAttribute('content') ||
        document.querySelector('meta[property="og:description"]')?.getAttribute('content') ||
        ''

      const headings: string[] = []
      document.querySelectorAll('h1, h2, h3').forEach(el => {
        const text = el.textContent?.trim()
        if (text && text.length > 5 && text.length < 200) headings.push(text)
      })

      const paragraphs: string[] = []
      document.querySelectorAll('p').forEach(el => {
        const text = el.textContent?.trim()
        // Paragraphs containing "{" are skipped (presumably inline JSON/CSS/template residue).
        if (text && text.length > 30 && text.length < 500 && !text.includes('{')) {
          paragraphs.push(text)
        }
      })

      const featureList: string[] = []
      document.querySelectorAll('ul li, ol li').forEach(el => {
        const text = el.textContent?.trim()
        if (text && text.length > 10 && text.length < 200) featureList.push(text)
      })

      // Only site-relative links are collected; the link text is kept, not the href.
      const links: string[] = []
      document.querySelectorAll('a[href^="/"], a[href^="./"]').forEach(el => {
        const text = el.textContent?.trim()
        if (text && text.length > 3 && text.length < 50) links.push(text)
      })

      // Get all visible text for raw analysis; body can be null on non-HTML documents.
      const bodyText = document.body?.innerText || ''

      return {
        title,
        metaDescription: metaDesc,
        headings,
        paragraphs,
        featureList,
        links,
        rawText: bodyText
      }
    })

    return {
      url: validatedUrl,
      title: extractedContent.title,
      metaDescription: extractedContent.metaDescription,
      headings: [...new Set(extractedContent.headings)].slice(0, 20),
      paragraphs: [...new Set(extractedContent.paragraphs)].slice(0, 30),
      featureList: [...new Set(extractedContent.featureList)].slice(0, 20),
      links: [...new Set(extractedContent.links)].slice(0, 15),
      rawText: extractedContent.rawText.slice(0, 10000) // Limit raw text
    }
  } catch (error: unknown) {
    console.error('Scraping error:', error)

    const message = error instanceof Error ? error.message : ''
    const isTimeoutError = error instanceof Error && error.name === 'TimeoutError'

    // Checked first, so any Chromium network failure ("net::ERR_*") is reported
    // under the DNS_ERROR code, as callers already expect.
    if (message.includes('ERR_NAME_NOT_RESOLVED') || message.includes('net::ERR')) {
      throw new ScrapingError(
        `Could not reach ${validatedUrl}. Please check the URL or try entering your product description manually.`,
        'DNS_ERROR'
      )
    }

    if (isTimeoutError || message.includes('timeout')) {
      throw new ScrapingError(
        'The website took too long to respond. Please try again or enter your product description manually.',
        'TIMEOUT'
      )
    }

    throw new ScrapingError(
      'Failed to scrape the website. Please try again or enter your product description manually.',
      'UNKNOWN'
    )
  } finally {
    if (browser) {
      try {
        await browser.close()
      } catch (closeError) {
        // A failure while closing must not replace the result or the
        // ScrapingError already propagating out of this function.
        console.error('Failed to close browser:', closeError)
      }
    }
  }
}
|
||||
|
||||
/**
 * Builds a `ScrapedContent` record from manually entered product details, for
 * use when no website is scraped.
 *
 * @param productName - Used as the title and the first heading.
 * @param description - Used as the meta description and the first paragraph.
 * @param features - Free text with one feature per line. Lines are split on
 *   LF or CRLF and trimmed, and blank lines are dropped.
 * @returns Content whose `url` is the fixed marker `'manual-input'` and whose
 *   `links` list is empty.
 */
export async function analyzeFromText(
  productName: string,
  description: string,
  features: string
): Promise<ScrapedContent> {
  // Split on both LF and CRLF so pasted Windows text leaves no trailing "\r",
  // and trim each entry so it matches the trimmed list items from scraping.
  const featureList = features
    .split(/\r?\n/)
    .map(line => line.trim())
    .filter(line => line.length > 0)

  return {
    url: 'manual-input',
    title: productName,
    metaDescription: description,
    headings: [productName, 'Features', 'Benefits'],
    paragraphs: [description, features],
    featureList,
    links: [],
    rawText: `${productName}\n\n${description}\n\n${features}`
  }
}
|
||||
Reference in New Issue
Block a user