// SanatiLeads/lib/scraper.ts

import puppeteer from 'puppeteer'
import type { ScrapedContent } from './types'

/**
 * Error raised by the scraping helpers in this module.
 *
 * Carries a short machine-readable `code` next to the human-readable
 * message so callers can branch on the failure category.
 */
export class ScrapingError extends Error {
  /** Machine-readable failure category supplied by the thrower. */
  public code: string

  constructor(message: string, code: string) {
    super(message)
    this.code = code
    // Override the default "Error" name so logs identify this error type.
    this.name = 'ScrapingError'
  }
}

/**
 * Loads a website in headless Chromium (via Puppeteer) and extracts its
 * title, meta description, headings, paragraphs, list items, internal link
 * labels and visible body text.
 *
 * The input is trimmed, and `https://` is prepended unless it already starts
 * with an `http://` or `https://` scheme (matched case-insensitively, so a
 * bare host such as `httpbin.org` is still prefixed). Inputs carrying any
 * other explicit `scheme://` are rejected.
 *
 * @param url - Website address, with or without a scheme (e.g. `example.com`).
 * @returns The extracted content. List fields are de-duplicated and capped
 *   (20 headings, 30 paragraphs, 20 list items, 15 links) and `rawText` is
 *   truncated to 10,000 characters.
 * @throws ScrapingError with code `INVALID_URL` when the input cannot be
 *   parsed as an http(s) URL; `DNS_ERROR` when the failure message contains
 *   `ERR_NAME_NOT_RESOLVED` or `net::ERR`; `TIMEOUT` when the failure is a
 *   timeout; `UNKNOWN` for anything else.
 */
export async function scrapeWebsite(url: string): Promise<ScrapedContent> {
  const trimmedUrl = url.trim()

  // Look for a real scheme separator rather than a bare "http" prefix, so
  // hosts like "httpbin.org" and upper-case schemes are handled correctly.
  const hasHttpScheme = /^https?:\/\//i.test(trimmedUrl)
  const hasOtherScheme = !hasHttpScheme && /^[a-z][a-z0-9+.-]*:\/\//i.test(trimmedUrl)
  const validatedUrl = hasHttpScheme ? trimmedUrl : `https://${trimmedUrl}`

  let isValidUrl = !hasOtherScheme
  if (isValidUrl) {
    try {
      new URL(validatedUrl)
    } catch {
      isValidUrl = false
    }
  }
  if (!isValidUrl) {
    throw new ScrapingError('Invalid URL format. Please enter a valid website URL.', 'INVALID_URL')
  }

  // De-duplicates while keeping first-seen order, then caps the list length.
  const dedupeAndCap = (items: string[], limit: number): string[] =>
    Array.from(new Set(items)).slice(0, limit)

  let browser
  try {
    browser = await puppeteer.launch({
      // NOTE(review): the accepted `headless` values differ between Puppeteer
      // major versions — confirm 'new' is valid for the installed version.
      headless: 'new',
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    })

    const page = await browser.newPage()
    await page.setViewport({ width: 1920, height: 1080 })
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')

    await page.goto(validatedUrl, {
      waitUntil: 'networkidle2',
      timeout: 30000
    })

    // This callback runs inside the page, so it must be self-contained:
    // it cannot reference variables or helpers from this module's scope.
    const extractedContent = await page.evaluate(() => {
      const title = document.title || document.querySelector('h1')?.textContent || ''

      const metaDesc = document.querySelector('meta[name="description"]')?.getAttribute('content') ||
                       document.querySelector('meta[property="og:description"]')?.getAttribute('content') || ''

      const headings: string[] = []
      document.querySelectorAll('h1, h2, h3').forEach(el => {
        const text = el.textContent?.trim()
        if (text && text.length > 5 && text.length < 200) headings.push(text)
      })

      const paragraphs: string[] = []
      document.querySelectorAll('p').forEach(el => {
        const text = el.textContent?.trim()
        // Paragraphs containing "{" are skipped (presumably inlined CSS/JSON/template text).
        if (text && text.length > 30 && text.length < 500 && !text.includes('{')) {
          paragraphs.push(text)
        }
      })

      const featureList: string[] = []
      document.querySelectorAll('ul li, ol li').forEach(el => {
        const text = el.textContent?.trim()
        if (text && text.length > 10 && text.length < 200) featureList.push(text)
      })

      // Only same-site links (relative hrefs); their visible labels are collected.
      const links: string[] = []
      document.querySelectorAll('a[href^="/"], a[href^="./"]').forEach(el => {
        const text = el.textContent?.trim()
        if (text && text.length > 3 && text.length < 50) links.push(text)
      })

      // Get all visible text for raw analysis. `document.body` can be null
      // for documents without a <body>, so guard the access.
      const bodyText = document.body?.innerText ?? ''

      return {
        title,
        metaDescription: metaDesc,
        headings,
        paragraphs,
        featureList,
        links,
        rawText: bodyText
      }
    })

    return {
      url: validatedUrl,
      title: extractedContent.title,
      metaDescription: extractedContent.metaDescription,
      headings: dedupeAndCap(extractedContent.headings, 20),
      paragraphs: dedupeAndCap(extractedContent.paragraphs, 30),
      featureList: dedupeAndCap(extractedContent.featureList, 20),
      links: dedupeAndCap(extractedContent.links, 15),
      rawText: extractedContent.rawText.slice(0, 10000) // Limit raw text
    }

  } catch (error: unknown) {
    console.error('Scraping error:', error)

    const message = error instanceof Error ? error.message : String(error)

    // Network-level failures are checked first, so a `net::ERR_*` timeout is
    // still reported under the network code, as before.
    if (message.includes('ERR_NAME_NOT_RESOLVED') || message.includes('net::ERR')) {
      throw new ScrapingError(
        `Could not reach ${validatedUrl}. Please check the URL or try entering your product description manually.`,
        'DNS_ERROR'
      )
    }

    const isTimeout =
      (error instanceof Error && error.name === 'TimeoutError') || /timeout/i.test(message)
    if (isTimeout) {
      throw new ScrapingError(
        'The website took too long to respond. Please try again or enter your product description manually.',
        'TIMEOUT'
      )
    }

    throw new ScrapingError(
      'Failed to scrape the website. Please try again or enter your product description manually.',
      'UNKNOWN'
    )
  } finally {
    if (browser) {
      // A failure while closing must not replace the result or the
      // ScrapingError already propagating out of this function.
      try {
        await browser.close()
      } catch (closeError: unknown) {
        console.error('Failed to close browser:', closeError)
      }
    }
  }
}

/**
 * Builds a `ScrapedContent`-shaped record from manually entered product
 * details, for use when no website is scraped.
 *
 * @param productName - Product name; used as the title and first heading.
 * @param description - Free-text description; used as the meta description
 *   and first paragraph.
 * @param features - Feature text, one feature per line. Each line is trimmed
 *   and blank lines are dropped, so CRLF input and padded lines yield clean
 *   entries. The unmodified text is still used for `paragraphs` and `rawText`.
 * @returns A record whose `url` is the literal `'manual-input'` and whose
 *   `links` list is empty.
 */
export async function analyzeFromText(
  productName: string,
  description: string,
  features: string
): Promise<ScrapedContent> {
  const featureList = features
    .split(/\r?\n/)
    .map(line => line.trim())
    .filter(line => line.length > 0)

  return {
    url: 'manual-input',
    title: productName,
    metaDescription: description,
    headings: [productName, 'Features', 'Benefits'],
    paragraphs: [description, features],
    featureList,
    links: [],
    rawText: `${productName}\n\n${description}\n\n${features}`
  }
}