- Fix relative imports in experiments/ scripts (../ → ../../) - Clean up tsconfig.json exclude list (remove non-existent paths) - All debug scripts now work from their new location Co-Authored-By: Claude <noreply@anthropic.com>
117 lines
3.9 KiB
TypeScript
117 lines
3.9 KiB
TypeScript
import 'dotenv/config';
|
||
import { chromium } from 'playwright';
|
||
import * as fs from 'fs';
|
||
import { Logger } from '../../utils/logger.js';
|
||
|
||
/** Shape of the diagnostic report assembled inside the browser page. */
interface ProductProbeResult {
  /** Trimmed text of the first <h1>, or '' when the page has none. */
  title: string;
  /** First JSON-LD document that looks like a Product description. */
  jsonLd?: unknown;
  /** Top-level keys of window.__NUXT__, when that global exists. */
  nuxtKeys?: string[];
  /** First __NUXT__ key whose serialized value mentions a field of interest. */
  nuxtDataKey?: string;
  /** First 500 characters of the serialized value stored under nuxtDataKey. */
  nuxtDataPreview?: string;
  /** Set to true when an inline <script> contained a matching JSON object. */
  foundInScript?: boolean;
  /** First 500 characters of the JSON object found in an inline <script>. */
  scriptDataPreview?: string;
  /** First 500 characters of the outer HTML of a product-like element. */
  productElement?: string;
  /** Values read from <meta itemprop="..."> tags (undefined when absent). */
  structuredData?: {
    metaBrand?: string;
    metaDesc?: string;
    metaWeight?: string;
  };
}

/**
 * Debug script: opens a hard-coded product page in headless Chromium, probes
 * several places where brand/description/weight data might live (meta tags,
 * JSON-LD, window.__NUXT__, inline scripts, data attributes), logs the
 * findings as JSON and saves the page HTML to `temp-product-page.html`.
 *
 * The browser is always closed, even when navigation or evaluation fails.
 *
 * @returns A promise that resolves once the report is logged and the HTML is saved.
 * @throws Propagates Playwright errors (e.g. navigation timeout) and file-system errors.
 */
async function main(): Promise<void> {
  Logger.info('=== Извлечение данных о товаре из HTML ===\n');

  const browser = await chromium.launch({ headless: true });
  try {
    const context = await browser.newContext();
    const page = await context.newPage();

    const productUrl = 'https://magnit.ru/product/1000233138-podguzniki_la_fresh_dlya_vzroslykh_l_10sht?shopCode=992301&shopType=6';

    Logger.info(`Загружаю страницу: ${productUrl}`);

    await page.goto(productUrl, {
      waitUntil: 'domcontentloaded',
      timeout: 20000,
    });

    // Fixed pause after DOMContentLoaded before the page is inspected.
    await page.waitForTimeout(3000);

    // Extract data from the HTML. The callback is serialized and executed in
    // the page, so it must not reference anything from the Node.js scope.
    const productData = await page.evaluate((): ProductProbeResult => {
      const result: ProductProbeResult = {
        title: document.querySelector('h1')?.textContent?.trim() ?? '',
      };

      // 1. Look in <meta> tags. The HTMLMetaElement type parameter is what
      //    makes `.content` type-check; a plain Element has no such property.
      const metaBrand = document.querySelector<HTMLMetaElement>('meta[itemprop="brand"]')?.content;
      const metaDesc = document.querySelector<HTMLMetaElement>('meta[itemprop="description"]')?.content;
      const metaWeight = document.querySelector<HTMLMetaElement>('meta[itemprop="weight"]')?.content;

      // 2. Look in JSON-LD structured data.
      const jsonLdScripts = Array.from(document.querySelectorAll('script[type="application/ld+json"]'));
      for (const script of jsonLdScripts) {
        try {
          const parsed: unknown = JSON.parse(script.textContent || '');
          if (typeof parsed === 'object' && parsed !== null) {
            const doc = parsed as Record<string, unknown>;
            if (doc['@type'] === 'Product' || doc.name === 'Product') {
              result.jsonLd = parsed;
              break;
            }
          }
        } catch {
          // Best effort: an empty or malformed JSON-LD block is skipped.
        }
      }

      // 3. Look in window globals.
      const nuxtData = (window as Window & { __NUXT__?: Record<string, unknown> }).__NUXT__;
      if (nuxtData) {
        result.nuxtKeys = Object.keys(nuxtData);
        // Check every top-level entry that might carry product data.
        for (const key of Object.keys(nuxtData)) {
          const val = nuxtData[key];
          if (!val || typeof val !== 'object') {
            continue;
          }
          let serialized: string;
          try {
            serialized = JSON.stringify(val);
          } catch {
            // Values that cannot be serialized (e.g. cyclic) are skipped
            // rather than aborting the whole probe.
            continue;
          }
          if (
            serialized.includes('brand') ||
            serialized.includes('description') ||
            serialized.includes('weight')
          ) {
            result.nuxtDataKey = key;
            result.nuxtDataPreview = serialized.substring(0, 500);
            break;
          }
        }
      }

      // 4. Look in the remaining <script> tags.
      const allScripts = Array.from(document.querySelectorAll('script'));
      for (const script of allScripts) {
        const text = script.textContent || '';
        if (!text.includes('"brand"') || text.length <= 100 || text.length >= 100000) {
          continue;
        }
        // Try to find JSON: greedy match from the first '{' to the last '}'.
        const match = text.match(/\{[\s\S]*\}/);
        if (!match) {
          continue;
        }
        try {
          const candidate = JSON.parse(match[0]) as Record<string, unknown>;
          if (candidate.brand || candidate.description || candidate.weight) {
            result.foundInScript = true;
            result.scriptDataPreview = JSON.stringify(candidate).substring(0, 500);
            break;
          }
        } catch {
          // Best effort: the matched span was not valid JSON; try the next script.
        }
      }

      // 5. Look in data attributes.
      const productElement = document.querySelector('[data-product-id], [data-product], [id*="product"]');
      if (productElement) {
        result.productElement = productElement.outerHTML.substring(0, 500);
      }

      // 6. Report the structured data found in meta tags.
      result.structuredData = {
        metaBrand,
        metaDesc,
        metaWeight,
      };

      return result;
    });

    Logger.info('=== РЕЗУЛЬТАТЫ ===\n');
    Logger.info(JSON.stringify(productData, null, 2));

    // Also save the HTML for offline analysis.
    const html = await page.content();
    const outputPath = 'temp-product-page.html';
    fs.writeFileSync(outputPath, html, 'utf-8');
    Logger.info(`\nHTML сохранен в: ${outputPath}`);
  } finally {
    // Always release the browser so a failed run does not leave Chromium
    // running and keep the Node.js process alive.
    await browser.close();
  }
}
|
||
|
||
// Entry point: report a failed run explicitly and signal it through the exit
// code instead of leaving the returned promise unhandled.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|