# 1. 安装Python3 (如果需要)
#    sudo apt update && sudo apt install -y python3 python3-pip python3-venv
# 2. 进入项目目录
#    cd /workspace
# 3. 使用UV创建虚拟环境
#    uv venv --python 3.11
# 4. 激活虚拟环境
#    source .venv/bin/activate
# 5. 安装依赖
#    uv pip install -r requirements.txt
#    uv pip install -e .
# 6. 运行应用
#    python app.py
import requests
import re
import json
def extract_origin_video_key(url, *, html=None):
    """Extract the ``originVideoKey`` of a Xiaohongshu video page.

    The page markup is searched for the first ``"originVideoKey": "..."``
    pair; its value is decoded and used to build the watermark-free video
    URL.

    Args:
        url: Page (or share-link) URL to download. Only used when ``html``
            is not supplied.
        html: Optional, already-downloaded page markup. When given, no HTTP
            request is made and ``url`` is ignored.

    Returns:
        A dict with the keys ``"originVideoKey"`` and
        ``"url_without_watermark"``, or ``None`` when the response status is
        not 200 or the key is not present in the markup.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts, ...)
        is not caught here and propagates to the caller.
    """
    if html is None:
        # A mobile User-Agent is required (flagged as critical by the
        # original author).
        mobile_headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1'
        }
        r = requests.get(url, headers=mobile_headers, timeout=30)
        if r.status_code != 200:
            return None
        html = r.text

    # Match the key/value pair directly in the raw markup.
    pattern = r'"originVideoKey"\s*:\s*"([^"]+)"'
    match = re.search(pattern, html)
    if not match:
        return None

    raw_key = match.group(1)
    try:
        # The value is the inside of a JSON string literal, so decode it as
        # one: this resolves \u002F, \u002f, \/ and any other JSON escape.
        origin_video_key = json.loads(f'"{raw_key}"')
    except ValueError:
        # Not a well-formed JSON string body; fall back to the raw text.
        origin_video_key = raw_key
    # Backstop for text that still carries a literal \u002F sequence
    # (fallback path above, or a double-escaped value).
    origin_video_key = origin_video_key.replace('\\u002F', '/')

    # Build the watermark-free video URL.
    url_without_watermark = f"http://sns-video-bd.xhscdn.com/{origin_video_key}"
    return {
        "originVideoKey": origin_video_key,
        "url_without_watermark": url_without_watermark
    }
# Usage example: resolve a share link and print the extracted key and URL
# (prints None when nothing could be extracted).
result = extract_origin_video_key(
    "http://xhslink.com/o/AEc2QuZgPnW"
)
print(result)
originVideoKey 位于HTML中的 video.media.consumer 对象内,完整结构如下:
{
"video": {
"media": {
"stream": {
"h264": [
{
"width": 1280,
"videoCodec": "h264",
"videoBitrate": 478960,
"audioChannels": 2,
"vmaf": -1,
"streamDesc": "MINI_APP_259",
"defaultStream": 0,
"avgBitrate": 547696,
"streamType": 259,
"audioDuration": 133000,
"rotate": 0,
"weight": 62,
"videoDuration": 133000,
"masterUrl": "http://sns-video-hw.xhscdn.com/stream/79/110/259/01e9169fca2b5717010370039a8060f17b_259.mp4",
"backupUrls": [
"http://sns-bak-v1.xhscdn.com/stream/79/110/259/01e9169fca2b5717010370039a8060f17b_259.mp4",
"http://sns-bak-v6.xhscdn.com/stream/79/110/259/01e9169fca2b5717010370039a8060f17b_259.mp4"
],
"ssim": 0,
"format": "mp4",
"audioCodec": "aac",
"height": 720,
"duration": 133000,
"volume": 0,
"psnr": 0,
"size": 9105458,
"fps": 24,
"audioBitrate": 64057,
"hdrType": 0,
"qualityType": "HD"
}
],
"h265": [
{
"audioDuration": 133000,
"masterUrl": "http://sns-video-hw.xhscdn.com/stream/79/110/114/01e9169fca2b57174f0370019a806214ac_114.mp4",
"psnr": 50.132999420166016,
"qualityType": "HD",
"width": 1280,
"volume": 0,
"avgBitrate": 430781,
"vmaf": -1,
"defaultStream": 0,
"size": 7161742,
"backupUrls": [
"http://sns-bak-v1.xhscdn.com/stream/79/110/114/01e9169fca2b57174f0370019a806214ac_114.mp4",
"http://sns-bak-v6.xhscdn.com/stream/79/110/114/01e9169fca2b57174f0370019a806214ac_114.mp4"
],
"streamDesc": "X265_MP4_WEB_114_h5",
"videoDuration": 133000,
"duration": 133000,
"audioBitrate": 128115,
"audioChannels": 2,
"streamType": 114,
"height": 720,
"videoBitrate": 297904,
"weight": 62,
"fps": 24,
"videoCodec": "hevc",
"rotate": 0,
"hdrType": 0,
"format": "mp4",
"ssim": 0,
"audioCodec": "aac"
}
],
"h266": [],
"av1": []
},
"videoId": 137666139158959890,
"video": {
"streamTypes": [259, 114],
"bizName": 110,
"bizId": "281781327238692381",
"duration": 133,
"md5": "8513e3b4c4e36cff57fe6078cd1b1aff",
"hdrType": 0,
"drmType": 0
},
"image": {
"thumbnailFileid": "frame/110/0/01e9169fca2b57170010000000019a8060b37f_0.webp"
},
"capa": {
"duration": 133
},
"consumer": {
"originVideoKey": "spectrum/1040g35031orot7m514105q062k5lvrg2g3vh458"
}
}
}
}
要点:
- originVideoKey 位于 video.media.consumer.originVideoKey 路径下
- 无水印视频URL格式为 http://sns-video-bd.xhscdn.com/{originVideoKey}
- 转义序列 \u002F 需要替换为 /

返回结果示例:
{ "originVideoKey": "spectrum/1040g35031orot7m514105q062k5lvrg2g3vh458", "url_without_watermark": "http://sns-video-bd.xhscdn.com/spectrum/1040g35031orot7m514105q062k5lvrg2g3vh458" }
import requests
import re
import json
def get_consumer_object_from_html(url, *, html=None):
    """Extract the complete ``consumer`` JSON object from a video page.

    Steps:
        1. Download the page with a mobile User-Agent (skipped when ``html``
           is supplied).
        2. Locate the first ``"originVideoKey": "..."`` pair.
        3. Search backwards (at most 5000 characters) for the nearest
           ``"consumer": {``.
        4. Parse exactly one JSON object starting at that opening brace.
        5. Return the raw snippet, the parsed object and the key.

    Args:
        url: Page (or share-link) URL to download. Only used when ``html``
            is not supplied.
        html: Optional, already-downloaded page markup. When given, no HTTP
            request is made and ``url`` is ignored.

    Returns:
        A dict with ``"consumer_raw"`` (the snippet exactly as it appears in
        the markup, including the ``"consumer":`` key), ``"consumer_data"``
        (the parsed object) and ``"originVideoKey"``; or ``None`` when the
        response status is not 200, the markers are not found, or the object
        cannot be parsed (a message is printed in the last case).

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts, ...)
        is not caught here and propagates to the caller.
    """
    # Step 1: fetch the page with a mobile User-Agent.
    if html is None:
        mobile_headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1'
        }
        r = requests.get(url, headers=mobile_headers, timeout=30)
        if r.status_code != 200:
            return None
        html = r.text

    # Step 2: find where originVideoKey occurs.
    key_match = re.search(r'"originVideoKey"\s*:\s*"([^"]+)"', html)
    if not key_match:
        return None
    pos = key_match.start()

    # Step 3: look backwards (up to 5000 characters) for `"consumer": {`
    # and keep the last hit, i.e. the one closest to originVideoKey.
    search_start = max(0, pos - 5000)
    consumer_match = None
    for consumer_match in re.finditer(r'"consumer"\s*:\s*\{', html[search_start:pos]):
        pass
    if consumer_match is None:
        return None
    json_start = search_start + consumer_match.start()
    # The pattern ends with the opening brace, so the object starts one
    # character before the end of the match. Slicing from here (instead of
    # stripping a fixed-length '"consumer":' prefix) also works when
    # whitespace surrounds the colon.
    brace_pos = search_start + consumer_match.end() - 1

    # Step 4: let the JSON decoder find the matching closing brace. Unlike
    # counting '{' / '}' characters, this is not confused by braces inside
    # string values, and it decodes escapes such as \u002F on the way.
    try:
        consumer_data, json_end = json.JSONDecoder().raw_decode(html, brace_pos)
        # The replace is a backstop for a double-escaped value that still
        # carries a literal \u002F after decoding.
        origin_video_key = consumer_data.get("originVideoKey", "").replace('\\u002F', '/')
    except (ValueError, AttributeError) as e:
        print(f"无法解析JSON: {e}")
        return None

    # Step 5: assemble the result.
    return {
        "consumer_raw": html[json_start:json_end],  # raw string as found in the HTML
        "consumer_data": consumer_data,  # parsed JSON object
        "originVideoKey": origin_video_key
    }
# Usage example: fetch a share link and, when extraction succeeded, show
# the raw snippet, the parsed object and the key.
result = get_consumer_object_from_html(
    "http://xhslink.com/o/AEc2QuZgPnW"
)
if result:
    print("HTML中的原始consumer对象:")
    print(result['consumer_raw'])

    print("\n解析后的consumer对象:")
    pretty_consumer = json.dumps(result['consumer_data'], indent=2, ensure_ascii=False)
    print(pretty_consumer)

    print("\noriginVideoKey:", result['originVideoKey'])
运行输出示例:

HTML中的原始consumer对象:
"consumer":{"originVideoKey":"spectrum\\u002F1040g35031orot7m514105q062k5lvrg2g3vh458"}

解析后的consumer对象:
{
  "originVideoKey": "spectrum/1040g35031orot7m514105q062k5lvrg2g3vh458"
}

originVideoKey: spectrum/1040g35031orot7m514105q062k5lvrg2g3vh458

提取步骤小结:
1. 用正则 "originVideoKey"\s*:\s*"([^"]+)" 找到位置
2. 向前查找 "consumer": {
3. 向后平衡查找对应的 }
4. 将 \u002F 替换为 /