数据抓取

基于 Node.js 请求目标页面并解析其中的数据,再以 API 接口的形式返回结果。

接口测试:http://test.52shujun.cn/

1. 安装库

npm i cheerio express superagent -S

2. app.js

const express = require('express')
const app = express()
const { getTrending, getNationality, getLanguage, getDate } = require('./util')

// GET /nationality — list the spoken-language ("nationality") filter options.
// Responds with { code: 200, item, msg } on success, or { code: -1, err, msg }
// when scraping fails. Exactly one response is sent per request.
app.get('/nationality', async (req, res) => {
  let data
  try {
    data = await getNationality()
  } catch (err) {
    res.json({
      code: -1,
      err,
      msg: '抓取数据失败',
    })
    // Stop here: falling through would send a second response and make
    // Express throw "Cannot set headers after they are sent".
    return
  }
  res.json({
    code: 200,
    item: data,
    msg: '请求成功',
  })
})

// GET /language — list the programming-language filter options.
// Responds with { code: 200, item, msg } on success, or { code: -1, err, msg }
// when scraping fails. Exactly one response is sent per request.
app.get('/language', async (req, res) => {
  let data
  try {
    data = await getLanguage()
  } catch (err) {
    res.json({
      code: -1,
      err,
      msg: '抓取数据失败',
    })
    // Stop here: falling through would send a second response and make
    // Express throw "Cannot set headers after they are sent".
    return
  }
  res.json({
    code: 200,
    item: data,
    msg: '请求成功',
  })
})

// GET /date — list the date-range filter options.
// Responds with { code: 200, item, msg } on success, or { code: -1, err, msg }
// when scraping fails. Exactly one response is sent per request.
app.get('/date', async (req, res) => {
  let data
  try {
    data = await getDate()
  } catch (err) {
    res.json({
      code: -1,
      err,
      msg: '抓取数据失败',
    })
    // Stop here: falling through would send a second response and make
    // Express throw "Cannot set headers after they are sent".
    return
  }
  res.json({
    code: 200,
    item: data,
    msg: '请求成功',
  })
})

// GET / — list trending repositories.
// Optional query parameters `language`, `since` and `nationality` each default
// to 'any' and are passed straight to getTrending.
// Responds with { code: 200, item, msg } on success, or { code: -1, err, msg }
// when scraping fails. Exactly one response is sent per request.
app.get('/', async (req, res) => {
  const { language = 'any', since = 'any', nationality = 'any' } = req.query
  let data
  try {
    data = await getTrending(language, since, nationality)
  } catch (err) {
    res.json({
      code: -1,
      err,
      msg: '抓取数据失败',
    })
    // Stop here: falling through would send a second response and make
    // Express throw "Cannot set headers after they are sent".
    return
  }
  res.json({
    code: 200,
    item: data,
    msg: '请求成功',
  })
})

// Start the HTTP server on port 3000 and log a message once it is listening.
app.listen(3000, function onListening() {
  console.log('服务已运行')
})

3. util.js

// 类似 jQuery
const cheerio = require('cheerio')
// 基于原生 node 的 ajax 请求库
const request = require('superagent')

// Target URL: the GitHub trending page requested by the scraper functions
const trending = `https://github.com/trending`

/**
 * Fetch the trending page and extract one record per `.Box-row` element.
 *
 * @param {string} language - Appended to the trending URL as a path segment
 *   unless it is 'any'.
 * @param {string} since - Sent as the `since` query parameter unless 'any'.
 * @param {string} nationality - Sent as the `spoken_language_code` query
 *   parameter unless 'any'.
 * @returns {Promise<Array<{title: string, author: string, avatarUrl: string,
 *   discription: string, language: string, star: string, url: string}>>}
 *   Resolves with the parsed rows. Rejects with the string 'null' when no row
 *   was found, and with the underlying error when the request or the parsing
 *   fails.
 */
async function getTrending(language, since, nationality) {
  // The language filter is a path segment; the other filters are query parameters.
  const targetUrl = language !== 'any' ? `${trending}/${language}` : trending
  const queryData = {}
  if (since !== 'any') {
    queryData.since = since
  }
  if (nationality !== 'any') {
    queryData.spoken_language_code = nationality
  }

  // Being a plain async function (instead of `new Promise(async ...)`) lets a
  // failed request or a parsing error reject the returned promise, rather than
  // leaving it pending forever behind an unhandled rejection.
  const html = await request.get(targetUrl).query(queryData)
  const $ = cheerio.load(html.text)
  const list = []

  $('.Box-row').each((index, item) => {
    const row = $(item)
    const link = row.find('h1 > a')
    // The link text has the form "author / title".
    const nameParts = link.text().split('/')
    const author = nameParts[0].trim()
    const title = nameParts[1].trim()
    const discription = row.find('p').text().trim()
    const language = row.find('div [itemprop="programmingLanguage"]').text()
    // The counter text can contain thousands separators (e.g. "1,234"), so
    // match digits and commas together and strip the commas afterwards.
    // Rows without a counter fall back to '0' instead of throwing.
    const starMatch = row
      .find('.d-inline-block.float-sm-right')
      .text()
      .match(/\d[\d,]*/)
    const star = starMatch ? starMatch[0].replace(/,/g, '') : '0'
    const avatarUrl = row.find('img.avatar').attr('src')
    const url = `https://github.com${link.attr('href')}`

    list.push({
      title,
      author,
      avatarUrl,
      discription,
      language,
      star,
      url,
    })
  })

  if (list.length === 0) {
    // Keep the historical 'null' string as the rejection value: the route
    // handlers serialize it directly into the JSON error response.
    return Promise.reject('null')
  }
  return list
}

/**
 * Fetch the trending page and extract the spoken-language ("nationality")
 * filter options.
 *
 * @returns {Promise<Array<{key: string, value: string}>>} Resolves with one
 *   entry per option link: `key` is the trimmed text of the link's child
 *   <span>, `value` is the part of the link's href after the first '='.
 *   Rejects with the string 'null' when no option was found, and with the
 *   underlying error when the request or the parsing fails.
 */
async function getNationality() {
  // Being a plain async function (instead of `new Promise(async ...)`) lets a
  // failed request or a parsing error reject the returned promise, rather than
  // leaving it pending forever behind an unhandled rejection.
  const html = await request.get(trending)
  const $ = cheerio.load(html.text)
  const list = []

  $('[data-filterable-for="text-filter-field-spoken-language"] a').each(
    (index, item) => {
      const anchor = $(item)
      list.push({
        key: anchor.find('> span').text().trim(),
        value: anchor.attr('href').split('=')[1],
      })
    }
  )

  if (list.length === 0) {
    // Keep the historical 'null' string as the rejection value: the route
    // handlers serialize it directly into the JSON error response.
    return Promise.reject('null')
  }
  return list
}

/**
 * Fetch the trending page and extract the programming-language filter options.
 *
 * @returns {Promise<Array<{key: string, value: string}>>} Resolves with one
 *   entry per option link: `key` is the trimmed text of the link's child
 *   <span>, `value` is the slash-free part of the href right before the '?'.
 *   Rejects with the string 'null' when no option was found, and with the
 *   underlying error when the request or the parsing fails (for example when
 *   an href contains no '?').
 */
async function getLanguage() {
  // Being a plain async function (instead of `new Promise(async ...)`) lets a
  // failed request or a parsing error reject the returned promise, rather than
  // leaving it pending forever behind an unhandled rejection.
  const html = await request.get(trending)
  const $ = cheerio.load(html.text)
  const list = []

  $('#languages-menuitems > div > a').each((index, item) => {
    const anchor = $(item)
    const key = anchor.find('> span').text().trim()
    // NOTE(review): `(\s\S)*` only matches whitespace/non-whitespace pairs;
    // `[\s\S]*` was presumably intended. Left unchanged to preserve output.
    const value = anchor.attr('href').match(/[^\/]+(\s\S)*(?=\?)/)[0]
    list.push({
      key,
      value,
    })
  })

  if (list.length === 0) {
    // Keep the historical 'null' string as the rejection value: the route
    // handlers serialize it directly into the JSON error response.
    return Promise.reject('null')
  }
  return list
}

/**
 * Fetch the trending page and extract the date-range filter options.
 *
 * @returns {Promise<Array<{key: string, value: string}>>} Resolves with one
 *   entry per option link: `key` is the trimmed text of the link's child
 *   <span>, `value` is the part of the link's href after the first '='.
 *   Rejects with the string 'null' when no option was found, and with the
 *   underlying error when the request or the parsing fails.
 */
async function getDate() {
  // Being a plain async function (instead of `new Promise(async ...)`) lets a
  // failed request or a parsing error reject the returned promise, rather than
  // leaving it pending forever behind an unhandled rejection.
  const html = await request.get(trending)
  const $ = cheerio.load(html.text)
  const list = []

  $('.select-menu-list > a.select-menu-item').each((index, item) => {
    const anchor = $(item)
    list.push({
      key: anchor.find('> span').text().trim(),
      value: anchor.attr('href').split('=')[1],
    })
  })

  if (list.length === 0) {
    // Keep the historical 'null' string as the rejection value: the route
    // handlers serialize it directly into the JSON error response.
    return Promise.reject('null')
  }
  return list
}

// Public API of this module: the four scraper functions.
Object.assign(module.exports, {
  getDate,
  getLanguage,
  getNationality,
  getTrending,
})

4. 参考图

  1. index

    index.png
  2. nationality

    nationality.png
  3. language

    language.png
  4. date

    date.png

常用模块

  1. cheerio 类似于 jQuery

    参考文档:https://github.com/cheeriojs/cheerio

  2. puppeteer 没用过

    参考文档:https://github.com/puppeteer/puppeteer

  3. node-schedule 定时任务

    参考文档:https://github.com/node-schedule/node-schedule

  4. superagent 请求库

    参考文档:https://www.jianshu.com/p/1432e0f29abd

  5. axios 请求库

    参考文档:https://github.com/axios/axios

留言

暂无评论

我要发表看法