README.md · main · kfc50/xhs-video

kfc50/xhs-video

Public

WeChat Login

Code Issues Pull requests Events Packages Insights

main

更新代码

PreviewCode viewBlame


# NOTE: steps 3 and 5 call `uv`, which none of the commands below install;
# it is assumed to be available on PATH already.

# 1. Install Python 3 with pip and venv support (if needed)
sudo apt update && sudo apt install -y python3 python3-pip python3-venv

# 2. Change into the project directory
cd /workspace

# 3. Create a virtual environment with uv, pinned to Python 3.11
uv venv --python 3.11

# 4. Activate the virtual environment (created as .venv in the current directory)
source .venv/bin/activate

# 5. Install the dependencies, then the project itself in editable mode
uv pip install -r requirements.txt
uv pip install -e .

# 6. Run the application
python app.py

originVideoKey 定位方法和完整JSON

定位方法代码片段


import requests
import re
import json

def extract_origin_video_key(url, html=None):
    """Extract the ``originVideoKey`` of a Xiaohongshu video page.

    Args:
        url: Share/note URL. It is fetched with a mobile User-Agent when
            ``html`` is not supplied.
        html: Optional page source that has already been downloaded. When
            given, no HTTP request is made and ``url`` is not used.

    Returns:
        A dict with the keys ``originVideoKey`` and ``url_without_watermark``,
        or ``None`` when the response status is not 200 or the page contains
        no ``originVideoKey``.

    Raises:
        Any exception raised by ``requests.get`` (e.g. on timeout) is not
        caught here and propagates to the caller.
    """
    if html is None:
        # A mobile User-Agent is essential: the desktop version of the page
        # does not contain originVideoKey.
        mobile_headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1'
        }
        r = requests.get(url, headers=mobile_headers, timeout=30)
        if r.status_code != 200:
            return None
        html = r.text

    match = re.search(r'"originVideoKey"\s*:\s*"([^"]+)"', html)
    if not match:
        return None

    raw_key = match.group(1)
    try:
        # The captured text is the inside of a JSON string literal, so let the
        # JSON decoder resolve every escape (\u002F, \u002f, \/ ...) instead
        # of special-casing a single spelling.
        origin_video_key = json.loads(f'"{raw_key}"')
    except ValueError:
        # Not a valid JSON string body: fall back to the plain replacement.
        origin_video_key = raw_key.replace('\\u002F', '/')

    # Build the watermark-free video URL from the decoded key.
    url_without_watermark = f"http://sns-video-bd.xhscdn.com/{origin_video_key}"

    return {
        "originVideoKey": origin_video_key,
        "url_without_watermark": url_without_watermark
    }

# Usage example. Guarded so that importing this module does not trigger a
# network request as a side effect.
if __name__ == "__main__":
    result = extract_origin_video_key("http://xhslink.com/o/AEc2QuZgPnW")
    print(result)

完整JSON结构示例

originVideoKey 位于HTML中的 video.media.consumer 对象内，完整结构如下：


{
  "video": {
    "media": {
      "stream": {
        "h264": [
          {
            "width": 1280,
            "videoCodec": "h264",
            "videoBitrate": 478960,
            "audioChannels": 2,
            "vmaf": -1,
            "streamDesc": "MINI_APP_259",
            "defaultStream": 0,
            "avgBitrate": 547696,
            "streamType": 259,
            "audioDuration": 133000,
            "rotate": 0,
            "weight": 62,
            "videoDuration": 133000,
            "masterUrl": "http://sns-video-hw.xhscdn.com/stream/79/110/259/01e9169fca2b5717010370039a8060f17b_259.mp4",
            "backupUrls": [
              "http://sns-bak-v1.xhscdn.com/stream/79/110/259/01e9169fca2b5717010370039a8060f17b_259.mp4",
              "http://sns-bak-v6.xhscdn.com/stream/79/110/259/01e9169fca2b5717010370039a8060f17b_259.mp4"
            ],
            "ssim": 0,
            "format": "mp4",
            "audioCodec": "aac",
            "height": 720,
            "duration": 133000,
            "volume": 0,
            "psnr": 0,
            "size": 9105458,
            "fps": 24,
            "audioBitrate": 64057,
            "hdrType": 0,
            "qualityType": "HD"
          }
        ],
        "h265": [
          {
            "audioDuration": 133000,
            "masterUrl": "http://sns-video-hw.xhscdn.com/stream/79/110/114/01e9169fca2b57174f0370019a806214ac_114.mp4",
            "psnr": 50.132999420166016,
            "qualityType": "HD",
            "width": 1280,
            "volume": 0,
            "avgBitrate": 430781,
            "vmaf": -1,
            "defaultStream": 0,
            "size": 7161742,
            "backupUrls": [
              "http://sns-bak-v1.xhscdn.com/stream/79/110/114/01e9169fca2b57174f0370019a806214ac_114.mp4",
              "http://sns-bak-v6.xhscdn.com/stream/79/110/114/01e9169fca2b57174f0370019a806214ac_114.mp4"
            ],
            "streamDesc": "X265_MP4_WEB_114_h5",
            "videoDuration": 133000,
            "duration": 133000,
            "audioBitrate": 128115,
            "audioChannels": 2,
            "streamType": 114,
            "height": 720,
            "videoBitrate": 297904,
            "weight": 62,
            "fps": 24,
            "videoCodec": "hevc",
            "rotate": 0,
            "hdrType": 0,
            "format": "mp4",
            "ssim": 0,
            "audioCodec": "aac"
          }
        ],
        "h266": [],
        "av1": []
      },
      "videoId": 137666139158959890,
      "video": {
        "streamTypes": [259, 114],
        "bizName": 110,
        "bizId": "281781327238692381",
        "duration": 133,
        "md5": "8513e3b4c4e36cff57fe6078cd1b1aff",
        "hdrType": 0,
        "drmType": 0
      },
      "image": {
        "thumbnailFileid": "frame/110/0/01e9169fca2b57170010000000019a8060b37f_0.webp"
      },
      "capa": {
        "duration": 133
      },
      "consumer": {
        "originVideoKey": "spectrum/1040g35031orot7m514105q062k5lvrg2g3vh458"
      }
    }
  }
}

关键点说明

必须使用手机版User Agent - PC版HTML中不包含originVideoKey
originVideoKey位置 - 在video.media.consumer.originVideoKey路径下
无水印视频URL构造 - http://sns-video-bd.xhscdn.com/{originVideoKey}
转义字符处理 - HTML中的\u002F需要替换为/

示例输出


{
  "originVideoKey": "spectrum/1040g35031orot7m514105q062k5lvrg2g3vh458",
  "url_without_watermark": "http://sns-video-bd.xhscdn.com/spectrum/1040g35031orot7m514105q062k5lvrg2g3vh458"
}

获取consumer对象的完整代码片段

从HTML中提取consumer对象


import requests
import re
import json

def get_consumer_object_from_html(url, html=None):
    """Extract the complete ``consumer`` object that holds ``originVideoKey``.

    Steps:
    1. Fetch the HTML with a mobile User-Agent (skipped when ``html`` is given).
    2. Locate ``"originVideoKey"`` in the page.
    3. Search backwards (at most 5000 characters) for the closest
       ``"consumer": {``.
    4. Decode the JSON object that starts at that opening brace.

    Args:
        url: Share/note URL. It is fetched when ``html`` is not supplied.
        html: Optional page source that has already been downloaded. When
            given, no HTTP request is made and ``url`` is not used.

    Returns:
        A dict with ``consumer_raw`` (the exact ``"consumer": {...}`` text as
        it appears in the page), ``consumer_data`` (the decoded object) and
        ``originVideoKey``; or ``None`` when the response status is not 200,
        when either marker cannot be found, or when the object is not valid
        JSON (in which case a message is printed).

    Raises:
        Any exception raised by ``requests.get`` is not caught here and
        propagates to the caller.
    """
    if html is None:
        # Step 1: a mobile User-Agent is used for the request.
        mobile_headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1'
        }
        r = requests.get(url, headers=mobile_headers, timeout=30)
        if r.status_code != 200:
            return None
        html = r.text

    # Step 2: locate originVideoKey.
    key_match = re.search(r'"originVideoKey"\s*:\s*"([^"]+)"', html)
    if not key_match:
        return None
    key_pos = key_match.start()

    # Step 3: look back at most 5000 characters and keep the LAST match,
    # i.e. the "consumer": { that is closest to originVideoKey.
    window_start = max(0, key_pos - 5000)
    consumer_match = None
    for consumer_match in re.finditer(r'"consumer"\s*:\s*\{', html[window_start:key_pos]):
        pass
    if consumer_match is None:
        return None

    raw_start = window_start + consumer_match.start()
    # The pattern ends with "{", so the object begins on its last character.
    brace_pos = window_start + consumer_match.end() - 1

    # Step 4: let the JSON decoder find the matching closing brace. Unlike
    # manual brace counting it ignores braces inside string values, and
    # starting at the brace makes stripping the '"consumer":' prefix
    # unnecessary (which used to fail when whitespace surrounded the colon).
    try:
        consumer_data, raw_end = json.JSONDecoder().raw_decode(html, brace_pos)
    except ValueError as e:
        print(f"无法解析JSON: {e}")
        return None

    origin_video_key = consumer_data.get("originVideoKey", "")
    if isinstance(origin_video_key, str):
        # JSON decoding already resolved \u002F; this only cleans up a
        # literal, doubly-escaped leftover.
        origin_video_key = origin_video_key.replace('\\u002F', '/')

    return {
        "consumer_raw": html[raw_start:raw_end],  # raw text as found in the HTML
        "consumer_data": consumer_data,            # decoded JSON object
        "originVideoKey": origin_video_key
    }

# Usage example. Guarded so that importing this module does not trigger a
# network request as a side effect.
if __name__ == "__main__":
    result = get_consumer_object_from_html("http://xhslink.com/o/AEc2QuZgPnW")
    if result:
        print("HTML中的原始consumer对象:")
        print(result['consumer_raw'])
        print("\n解析后的consumer对象:")
        print(json.dumps(result['consumer_data'], indent=2, ensure_ascii=False))
        print("\noriginVideoKey:", result['originVideoKey'])

输出示例


HTML中的原始consumer对象:
"consumer":{"originVideoKey":"spectrum\\u002F1040g35031orot7m514105q062k5lvrg2g3vh458"}

解析后的consumer对象:
{
  "originVideoKey": "spectrum/1040g35031orot7m514105q062k5lvrg2g3vh458"
}

originVideoKey: spectrum/1040g35031orot7m514105q062k5lvrg2g3vh458

代码说明

定位originVideoKey - 使用正则表达式"originVideoKey"\s*:\s*"([^"]+)"找到位置
向前搜索consumer - 从originVideoKey位置向前查找"consumer": {
平衡大括号 - 向后查找对应的闭合大括号}
处理转义字符 - 将\u002F替换为/
解析JSON - 去掉键名后解析为JSON对象

35/F,Tencent Building,Kejizhongyi Avenue,Nanshan District,Shenzhen

京ICP备11018762号-111