12
« on: July 17, 2014, 09:01:36 PM »
Scan music video artists: this script extracts all artist names from the musicvideos.xml generated by the ScanMusicVideos.py script above and writes them to a new XML file. Make sure each file name has a hyphen between the artist and the song name.
#-------------------------------------------------------------------------------
# Name: Extract elements from XML
# Purpose: Extract artist names from musicvideos.xml into a VoxCommando payload XML
#
# Author: xtermin8r
#
# Created: 10/06/2014
# Copyright: (c) xtermin8r 2014
# Licence: <your licence>
#-------------------------------------------------------------------------------
import urllib, urllib2
import re
import os
from re import findall
from xml.etree import ElementTree as ET
# Plain-text list of extracted artists. Opened in binary mode here and closed
# once the extraction loop further down has finished writing to it.
txtfile = open("E:\\Appz\\Vox\\payloads\\artist_videos.txt", "wb")
# Input payload file (produced by ScanMusicVideos.py). Backslashes are escaped
# explicitly: sequences such as "\A" or "\m" are invalid escapes that only
# worked by accident and trigger warnings on current Python versions.
xmlin = "E:\\Appz\\Vox\\Payloads\\musicvideos.xml"
# Output payload file containing one entry per artist.
xmlout = "E:\\Appz\\Vox\\payloads\\ArtistVideos.xml"
#function: get rid of unwanted characters
# Release/encoding tags that are dropped from a name. Matching is
# case-sensitive and limited to the spellings listed here. A tag is only
# removed when it stands on its own (or in a run of tags such as "DVDRip"),
# never when it is merely part of a longer word: "Birthday" keeps its "hd",
# "Isaac" keeps its "aac", "Script" keeps its "rip".
_CFIX_TAG_RUN = re.compile(
    r"(?<![A-Za-z0-9])"
    r"(?:AlbumArtSmall|AlbumArt|folder|1080[pi]|720[pP]|x264|[Hh]264"
    r"|ac3|AC3|aac|DTS|DVD|DvD|VOB|[Rr]ip|hd|HD|Hd|Hq|Blu[ .]?Ray|5\.1)+"
    r"(?![A-Za-z0-9])"
)
# Punctuation that is turned into a blank. "." is handled separately so that
# the "5.1" tag can still be recognised before dots are removed.
_CFIX_SEPARATORS = re.compile(r"[-:_!?@()\[\]]")
_CFIX_WHITESPACE = re.compile(r"\s+")


def cfix(zx):
    """Return *zx* with separators and release tags replaced by single blanks.

    Steps, in order:
      1. collapse whitespace and trim;
      2. drop the literal "xDR;" marker and normalise an en dash to "-";
      3. replace ``- : _ ! ? @ ( ) [ ]`` with blanks;
      4. remove stand-alone release tags (1080p, x264, DVD, Rip, HD, ...);
      5. replace "." with blanks, then collapse whitespace and trim again.

    The result is also stored in the module-level global ``x`` because
    existing callers read it from there instead of using the return value.

    zx -- the raw text to clean (must be a string).
    """
    global x
    cleaned = _CFIX_WHITESPACE.sub(" ", zx).strip()
    cleaned = cleaned.replace("xDR;", " ")
    # Normalise the en dash *before* hyphens are stripped; doing it afterwards
    # would re-introduce a "-" into the cleaned text.
    cleaned = cleaned.replace(u"\u2013", "-")
    cleaned = _CFIX_SEPARATORS.sub(" ", cleaned)
    cleaned = _CFIX_TAG_RUN.sub(" ", cleaned)
    cleaned = cleaned.replace(".", " ")
    x = _CFIX_WHITESPACE.sub(" ", cleaned).strip()
    return x
# Parse the input payload file and build one <payload> per distinct artist.
tree = ET.parse(xmlin)
root = tree.getroot()
# Root of the generated document: <PayloadsRoot> ... </PayloadsRoot>
root_element = ET.Element("PayloadsRoot")
t1 = ""  # most recently emitted artist
n1 = 1   # running number written into each <value>
# Lower-cased artists already emitted, so duplicates are skipped even when
# the input is not grouped by artist.
_seen_artists = set()
try:
    for data in root.findall('payload'):
        value_node = data.find('value')
        # Skip payloads without a usable <value> instead of crashing on None.
        if value_node is None or not value_node.text:
            continue
        # Keep only the file name and drop its extension (if it has one).
        tail = os.path.split(value_node.text)[1]
        value = os.path.splitext(tail)[0]
        # Remove a leading track number only when it is followed by a
        # separator, so names such as "2Pac" are left intact.
        # NOTE: a name that starts with a number and a blank (e.g. "50 Cent")
        # still cannot be told apart from a track number here.
        value = re.sub(r'^[0-9]+[\s._-]+', '', value)
        # The artist is everything before the first hyphen.
        value = value.split('-', 1)[0]
        artist = cfix(value)
        if not artist or artist.lower() in _seen_artists:
            continue
        _seen_artists.add(artist.lower())
        # <payload><value>N</value><phrase>artist</phrase>
        #          <subsetmatching>true</subsetmatching></payload>
        pay_element = ET.SubElement(root_element, "payload")
        ET.SubElement(pay_element, "value").text = str(n1)
        ET.SubElement(pay_element, "phrase").text = artist
        ET.SubElement(pay_element, "subsetmatching").text = "true"
        n1 = n1 + 1
        # txtfile is opened in binary mode, so encode explicitly; this also
        # keeps non-ASCII artist names from raising on write.
        txtfile.write((artist + u"\n").encode("utf-8"))
        t1 = artist
finally:
    # Always release the text file, even if parsing a payload fails.
    txtfile.close()
def indent(elem, level=0, space='\t'):
    """Pretty-print *elem* in place by rewriting whitespace-only text/tails.

    elem  -- the element whose subtree is indented.
    level -- nesting depth of *elem*; 0 for the document root.
    space -- string used for one indentation step (a tab by default).

    Text or tails that contain non-whitespace content are left untouched.
    Returns None; the tree is modified in place.
    """
    i = "\n" + level * space
    if len(elem):
        # Opening tag is followed by a newline and the children's indent.
        if not elem.text or not elem.text.strip():
            elem.text = i + space
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
        for child in elem:
            indent(child, level + 1, space)
        # The last child's tail precedes this element's closing tag, so it
        # is pulled back to this element's own indentation.
        last_child = elem[-1]
        if not last_child.tail or not last_child.tail.strip():
            last_child.tail = i
    elif level and (not elem.tail or not elem.tail.strip()):
        elem.tail = i
# Pretty-print the generated tree in place before serialising it.
indent(root_element)
#View xml tree
print ( ET.tostring(root_element) )
# Serialise as UTF-8 bytes so the payload really matches the encoding named
# in the XML declaration written below.
xml_bytes = ET.tostring(root_element, encoding="utf-8")
# Binary mode writes the bytes exactly as produced; the with-block closes the
# file even if a write fails.
with open(xmlout, 'wb') as output_file:
    output_file.write(b'<?xml version="1.0" encoding="utf-8"?>' + b'\n')
    output_file.write(b'<!--A VoxCommando Payload file-->' + b'\n')
    output_file.write(xml_bytes)