Sports Data Extraction Tool

Comprehensive Flashscore.com scraper built with Playwright and Cheerio. Collects match data, lineups, statistics and schedules.

Playwright · Cheerio · CRON Scheduling · Stealth Mode

Project Structure

src/scrapers/base-scraper.js
JavaScript
const { chromium } = require('playwright');
const StealthPlugin = require('puppeteer-extra-plugin-stealth')();
const UserAgent = require('user-agents');
const ProxyManager = require('../utils/proxy-manager');

/**
 * Shared Playwright plumbing for the scrapers: launches a Chromium instance
 * behind the next proxy, opens one context/page, and offers navigation,
 * randomized delay and shutdown helpers.
 */
class BaseScraper {
    constructor() {
        this.proxyManager = new ProxyManager();
        this.stealthPlugin = StealthPlugin;
        this.userAgent = new UserAgent();
        // Populated by launchBrowser(); kept null so closeBrowser() is safe
        // to call even when the launch never happened or failed.
        this.browser = null;
        this.context = null;
        this.page = null;
    }

    /**
     * Converts a proxy description into the object shape used for the
     * `proxy` launch option.
     *
     * A URL with inline credentials ("http://user:pass@host:port") is split
     * into `server`, `username` and `password`, because Playwright takes the
     * credentials as separate fields rather than inside `server`.
     *
     * @param {string|object|null|undefined} proxy - Proxy URL, a ready-made
     *     options object, or nothing.
     * @returns {object|undefined} Proxy options, or undefined when no proxy
     *     was supplied.
     */
    static toProxyOptions(proxy) {
        if (!proxy) {
            return undefined;
        }
        if (typeof proxy !== 'string') {
            // Already an options object; pass it through untouched.
            return proxy;
        }

        try {
            const parsed = new URL(proxy);
            if (!parsed.host || !parsed.username) {
                // Short form ("host:port") or no credentials: nothing to split.
                return { server: proxy };
            }
            return {
                server: `${parsed.protocol}//${parsed.host}`,
                username: decodeURIComponent(parsed.username),
                password: decodeURIComponent(parsed.password),
            };
        } catch (error) {
            // Unparseable value: hand it over unchanged, as before.
            return { server: proxy };
        }
    }

    /**
     * Launches headless Chromium, creates a context with a random user agent
     * and a 1920x1080 viewport, opens a page and installs resource blocking.
     * Sets `this.browser`, `this.context` and `this.page`.
     *
     * @returns {Promise<void>}
     */
    async launchBrowser() {
        const launchOptions = {
            headless: true,
            args: [
                '--disable-blink-features=AutomationControlled',
                '--no-sandbox'
            ],
        };

        const proxy = BaseScraper.toProxyOptions(this.proxyManager.getNextProxy());
        if (proxy) {
            launchOptions.proxy = proxy;
        } else {
            // Previously an absent proxy was sent as `{ server: undefined }`.
            console.warn('No proxy available; launching browser with a direct connection');
        }

        this.browser = await chromium.launch(launchOptions);

        this.context = await this.browser.newContext({
            userAgent: this.userAgent.toString(),
            viewport: { width: 1920, height: 1080 },
        });

        this.page = await this.context.newPage();

        // Block unnecessary resources
        await this.page.route(/\.(jpg|jpeg|png|gif|css|ads|adservice|googleadservices|doubleclick)/, route => route.abort());

        // Enable stealth
        // NOTE(review): this plugin targets Puppeteer; its effect on a
        // Playwright page is unverified and this hook may be a no-op — confirm.
        await this.stealthPlugin.onPageCreated(this.page);
    }

    /**
     * Navigates the page to `url`, waits for network idle (60 s timeout) and
     * then pauses for a random 2-8 seconds.
     *
     * @param {string} url - Address to open.
     * @returns {Promise<void>}
     */
    async navigateTo(url) {
        await this.page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });
        await this.randomDelay(2000, 8000);
    }

    /**
     * Waits a uniformly random whole number of milliseconds in [min, max].
     *
     * @param {number} min - Lower bound in milliseconds (inclusive).
     * @param {number} max - Upper bound in milliseconds (inclusive).
     * @returns {Promise<void>}
     */
    async randomDelay(min, max) {
        const delay = Math.floor(Math.random() * (max - min + 1)) + min;
        await this.page.waitForTimeout(delay);
    }

    /**
     * Closes the browser if one is open. Safe to call repeatedly and after a
     * failed launch, so it can be used from error-handling paths without
     * masking the original failure.
     *
     * @returns {Promise<void>}
     */
    async closeBrowser() {
        if (!this.browser) {
            return;
        }
        const browser = this.browser;
        // Drop the references first so a second call cannot close twice.
        this.browser = null;
        this.context = null;
        this.page = null;
        await browser.close();
    }
}

module.exports = BaseScraper;
src/utils/browser-manager.js
JavaScript
const { chromium } = require('playwright');
// Symbol key under which the single BrowserManager instance is cached on the class.
const singleton = Symbol();
// Module-private token the BrowserManager constructor requires, so the class
// cannot be constructed directly from outside this module.
const singletonEnforcer = Symbol();

/**
 * Process-wide holder of a single shared Chromium browser.
 * Obtain it through `BrowserManager.instance`; direct construction throws.
 */
class BrowserManager {
    /**
     * @param {symbol} enforcer - Must be the module-private enforcer token.
     * @throws {Error} When called without the enforcer token.
     */
    constructor(enforcer) {
        if (enforcer !== singletonEnforcer) {
            throw new Error('Cannot construct singleton');
        }
        this.browser = null;
        // In-flight launch shared by concurrent launch() callers.
        this.pendingLaunch = null;
    }

    /**
     * The lazily created singleton instance.
     *
     * @returns {BrowserManager}
     */
    static get instance() {
        if (!this[singleton]) {
            this[singleton] = new BrowserManager(singletonEnforcer);
        }
        return this[singleton];
    }

    /**
     * Returns the shared browser, launching it when there is none or the
     * previous one has disconnected.
     *
     * Concurrent callers share one launch: without this, two simultaneous
     * calls would each start a browser and the first would be overwritten
     * and never closed.
     *
     * @returns {Promise<object>} The connected browser.
     */
    async launch() {
        if (this.browser && this.browser.isConnected()) {
            return this.browser;
        }

        if (!this.pendingLaunch) {
            this.pendingLaunch = chromium
                .launch({
                    headless: true,
                    args: [
                        '--disable-blink-features=AutomationControlled',
                        '--no-sandbox'
                    ],
                })
                .then((browser) => {
                    this.browser = browser;
                    return browser;
                })
                .finally(() => {
                    // Clear on success and failure so a failed launch can be retried.
                    this.pendingLaunch = null;
                });
        }

        return this.pendingLaunch;
    }

    /**
     * Creates a new browser context on the shared browser.
     *
     * @param {object} [options={}] - Extra context options; these override
     *     the default viewport and user agent when the same key is given.
     * @returns {Promise<object>} The new browser context.
     */
    async newContext(options = {}) {
        const browser = await this.launch();
        return browser.newContext({
            viewport: { width: 1920, height: 1080 },
            userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
            ...options,
        });
    }

    /**
     * Closes the shared browser, if any, and forgets it so the next
     * launch() starts a fresh one.
     *
     * @returns {Promise<void>}
     */
    async close() {
        if (this.browser) {
            await this.browser.close();
            this.browser = null;
        }
    }
}

module.exports = BrowserManager;
src/scrapers/match-summary.js
JavaScript
const BaseScraper = require('./base-scraper');
const cheerio = require('cheerio');
const MatchData = require('../models/match-data');
const DataProcessor = require('../utils/data-processor');

/**
 * Scrapes the summary of a single match page and turns it into a MatchData
 * record (teams, scores, date/time and the list of match events).
 */
class MatchSummaryScraper extends BaseScraper {
    /**
     * @param {string} matchUrl - URL of the match page to scrape.
     */
    constructor(matchUrl) {
        super();
        this.matchUrl = matchUrl;
        this.matchData = new MatchData();
    }

    /**
     * Opens the match page, waits for the summary element, parses the HTML
     * and stores the result in `this.matchData`.
     *
     * The browser is released in `finally`, so it is closed exactly once on
     * both the success and the failure path.
     *
     * @returns {Promise<MatchData>} The parsed match.
     * @throws Rethrows any launch, navigation or parsing error after logging it.
     */
    async scrape() {
        try {
            await this.launchBrowser();
            await this.navigateTo(this.matchUrl);
            await this.page.waitForSelector('.matchSummary', { timeout: 10000 });

            const html = await this.page.content();
            this.matchData = this.parseMatchSummary(html);
            return this.matchData;
        } catch (error) {
            console.error('Error scraping match summary:', error);
            throw error;
        } finally {
            await this.safeCloseBrowser();
        }
    }

    /**
     * Closes the browser without letting a cleanup failure replace the
     * scrape outcome. When the launch itself failed there is no browser to
     * close; calling closeBrowser() then would throw and hide the real error.
     *
     * @returns {Promise<void>}
     */
    async safeCloseBrowser() {
        if (!this.browser) {
            return;
        }
        try {
            await this.closeBrowser();
        } catch (closeError) {
            console.error('Error closing browser:', closeError);
        }
    }

    /**
     * Extracts match fields and events from the page HTML.
     *
     * @param {string} html - Full HTML of the match page.
     * @returns {MatchData} Populated match, with the date and events passed
     *     through DataProcessor's normalizers.
     */
    parseMatchSummary(html) {
        const $ = cheerio.load(html);
        const match = new MatchData();

        // Parse teams and scores
        match.homeTeam = $('.home-team-name').text().trim();
        match.awayTeam = $('.away-team-name').text().trim();
        match.score = $('.score').text().trim();
        match.halfTimeScore = $('.half-time-score').text().trim();

        // Parse date and time
        match.date = $('.match-date').attr('data-date');
        match.time = $('.match-time').attr('data-time');

        // Parse match events
        $('.event-row').each((i, element) => {
            const row = $(element);
            // Guard the attribute lookup so a missing class cannot throw.
            const classNames = row.attr('class') || '';

            match.addEvent({
                type: row.find('.event-type').text().trim(),
                time: row.find('.event-time').text().trim(),
                player: row.find('.event-player').text().trim(),
                team: classNames.includes('home') ? 'home' : 'away'
            });
        });

        // Data normalization
        match.date = DataProcessor.normalizeDate(match.date);
        match.events = DataProcessor.normalizeEvents(match.events);

        return match;
    }
}

module.exports = MatchSummaryScraper;
src/models/match-data.js
JavaScript
/**
 * Plain record describing one match: identity, teams, timing, scores,
 * venue details, the event list and a statistics map.
 */
class MatchData {
    /** Creates an empty record stamped with the current time. */
    constructor() {
        Object.assign(this, {
            id: '',
            homeTeam: '',
            awayTeam: '',
            competition: '',
            status: '',
            date: '',
            time: '',
            score: '',
            halfTimeScore: '',
            venue: '',
            attendance: 0,
            referee: '',
            events: [],
            statistics: {},
            lastUpdated: new Date(),
        });
    }

    /**
     * Appends an event to the match's event list.
     *
     * @param {*} event - Event to record.
     */
    addEvent(event) {
        const { events } = this;
        events.push(event);
    }

    /**
     * Stores (or overwrites) a statistic under the given key.
     *
     * @param {string} type - Statistic name.
     * @param {*} value - Statistic value.
     */
    addStatistic(type, value) {
        const { statistics } = this;
        statistics[type] = value;
    }

    /**
     * Serializable snapshot of the record; `lastUpdated` becomes an ISO string.
     *
     * @returns {object} Plain object with the record's fields.
     */
    toJSON() {
        const {
            id,
            homeTeam,
            awayTeam,
            competition,
            status,
            date,
            time,
            score,
            halfTimeScore,
            venue,
            attendance,
            referee,
            events,
            statistics,
            lastUpdated,
        } = this;

        return {
            id,
            homeTeam,
            awayTeam,
            competition,
            status,
            date,
            time,
            score,
            halfTimeScore,
            venue,
            attendance,
            referee,
            events,
            statistics,
            lastUpdated: lastUpdated.toISOString(),
        };
    }
}

module.exports = MatchData;

Project Features

Stealth Mode

Avoid detection with advanced techniques like randomized user agents, request delays, and proxy rotation.

Scheduled Scraping

Regularly collect data using cron scheduling and automated retries with exponential backoff.

Modular Architecture

Clean separation of concerns with independent modules for scraping, data processing, and utilities.

Installation & Usage

1

Initialize Project

# Create project directory
mkdir flashscore-scraper
cd flashscore-scraper

# Initialize npm project
npm init -y

# Install dependencies
npm install playwright cheerio axios dotenv fs-extra node-cron puppeteer-extra-plugin-stealth user-agents
2

Configure Environment

Create a .env file with your configuration:

# Proxy configuration
PROXY_SERVERS="http://user:pass@proxy1.com:8080,http://user:pass@proxy2.com:8080"

# Flashscore base URL
BASE_URL="https://www.flashscore.com"

# Schedule - every day at midnight
CRON_SCHEDULE="0 0 * * *"

# Output directory
DATA_DIR="./data"
3

Run Scraper

Execute the main script directly or set up a cron job:

# Run once for a specific match
node src/index.js --matchId "123456"

# Or run on a schedule according to settings.js
node src/index.js --cron

Made with DeepSite - 🧬 Remix